diff --git a/.gitignore b/.gitignore index 18cec8dbf..8fdce5901 100644 Binary files a/.gitignore and b/.gitignore differ diff --git a/.vscode/settings.json b/.vscode/settings.json index 9991264c7..29f4499ca 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,3 +1,4 @@ { - "jupyter.jupyterLaunchTimeout": 600000 + "jupyter.jupyterLaunchTimeout": 600000, + "CodeGPT.apiKey": "OpenAI" } \ No newline at end of file diff --git a/DEPLOY_GKE_FIX.md b/DEPLOY_GKE_FIX.md new file mode 100644 index 000000000..d649ec9f3 --- /dev/null +++ b/DEPLOY_GKE_FIX.md @@ -0,0 +1,343 @@ +# Deploy Login Fix to GKE (34.151.181.137) + +## Quick Summary +Your remote server `34.151.181.137` is running on **Google Kubernetes Engine (GKE)**. The login works on localhost but not on GKE because: + +1. **Code is not deployed** - The fixes are only on your local machine +2. **reCAPTCHA domain** - IP address not in allowed domains + +## Fastest Fix (5 minutes) - Add reCAPTCHA Domain + +This will make login work **immediately** without code deployment: + +### Step 1: Add IP to reCAPTCHA Console +1. Go to https://console.cloud.google.com/security/recaptcha +2. Select the site key: `6Lee1k0sAAAAAGYhpdP_0jcaghmD5Ta6K8WPUsyA` +3. Click "Settings" or edit icon +4. Under "Domains", add: + ``` + 34.151.181.137 + ``` +5. Click "Save" +6. Wait 1-2 minutes for propagation + +### Step 2: Test Login +1. Open http://34.151.181.137:8080/login (use incognito mode) +2. Enter credentials +3. Login should work now! + +--- + +## Proper Fix - Deploy Updated Code to GKE + +### Prerequisites +```powershell +# Install Google Cloud SDK if not already installed +# https://cloud.google.com/sdk/docs/install + +# Authenticate +gcloud auth login +gcloud config set project YOUR_PROJECT_ID + +# Get cluster credentials +gcloud container clusters get-credentials echonet-gke --region australia-southeast2 +``` + +### Option 1: Deploy via Cloud Build (Automated) + +```powershell +# From your local project directory +cd C:\Users\syyen_ybpva\Project-Echo + +# Commit your changes +git add src/Echo_Components_on_K8s/frontend/routes/auth.routes.js +git add src/Echo_Components_on_K8s/frontend/public/login.html +git commit -m "Fix login authentication and reCAPTCHA" +git push origin main + +# Trigger Cloud Build +gcloud builds submit --config=cloudbuild.yaml . + +# This will: +# 1. Build new Docker images with your fixes +# 2. Push to Google Container Registry +# 3. Deploy to GKE +``` + +### Option 2: Manual Docker Build & Push + +```powershell +cd C:\Users\syyen_ybpva\Project-Echo + +# Set variables +$PROJECT_ID = "YOUR_PROJECT_ID" +$REGION = "australia-southeast2" +$REPOSITORY = "echonet" + +# Build the frontend image with fixes +cd src\Echo_Components_on_K8s\frontend +docker build -t ${REGION}-docker.pkg.dev/${PROJECT_ID}/${REPOSITORY}/hmi:fixed . + +# Push to registry +docker push ${REGION}-docker.pkg.dev/${PROJECT_ID}/${REPOSITORY}/hmi:fixed + +# Update Kubernetes deployment +kubectl set image deployment/hmi-deployment hmi=${REGION}-docker.pkg.dev/${PROJECT_ID}/${REPOSITORY}/hmi:fixed -n echo + +# Check rollout status +kubectl rollout status deployment/hmi-deployment -n echo +``` + +### Option 3: Quick Patch (Update ConfigMap) + +If the login HTML is in a ConfigMap: + +```powershell +# Edit the ConfigMap +kubectl edit configmap hmi-config -n echo + +# Update the login.html content with fixes +# Save and exit + +# Restart pods to pick up changes +kubectl rollout restart deployment/hmi-deployment -n echo +``` + +--- + +## Verify Deployment + +### Check Pod Status +```powershell +kubectl get pods -n echo +kubectl logs -f deployment/hmi-deployment -n echo +``` + +### Check Service +```powershell +kubectl get svc -n echo +kubectl describe svc hmi-service -n echo +``` + +### Check Ingress +```powershell +kubectl get ingress -n echo +kubectl describe ingress echo-ingress -n echo +``` + +### Test Login +```powershell +# Check if service is responding +Invoke-WebRequest -Uri "http://34.151.181.137:8080/login" -UseBasicParsing + +# Test authentication endpoint +Invoke-WebRequest -Uri "http://34.151.181.137:8080/api/auth/signin" ` + -Method POST ` + -ContentType "application/json" ` + -Body '{"username":"TN-HMI","password":"your_password"}' ` + -UseBasicParsing +``` + +--- + +## Architecture Overview + +``` +Local Machine (localhost:8080) ✅ Working + ↓ Fixed code exists here + +GKE Cluster (34.151.181.137) ❌ Old code + ├── HMI Frontend (Port 8080) + ├── API Backend (ts-api-cont, Port 9000) + ├── MongoDB (Port 27017) + └── Redis (Port 6379) +``` + +--- + +## Files to Deploy + +These files have the fixes and need to be in the Docker image: + +1. `src/Echo_Components_on_K8s/frontend/routes/auth.routes.js` + - Local MongoDB authentication fallback + - Fixed database name + +2. `src/Echo_Components_on_K8s/frontend/public/login.html` + - Optional reCAPTCHA + - Better error handling + +--- + +## Dockerfile Check + +Make sure the Dockerfile includes these files: + +```dockerfile +# src/Echo_Components_on_K8s/frontend/Dockerfile +FROM node:18 + +WORKDIR /app + +COPY package*.json ./ +RUN npm install + +# Make sure these are copied +COPY routes/ ./routes/ +COPY public/ ./public/ +COPY . . + +EXPOSE 8080 +CMD ["node", "server.js"] +``` + +--- + +## Rollback if Needed + +If something goes wrong: + +```powershell +# Rollback to previous version +kubectl rollout undo deployment/hmi-deployment -n echo + +# Check rollout history +kubectl rollout history deployment/hmi-deployment -n echo + +# Rollback to specific revision +kubectl rollout undo deployment/hmi-deployment -n echo --to-revision=2 +``` + +--- + +## Environment Variables + +Make sure these are set in your Kubernetes deployment: + +```yaml +env: + - name: MONGODB_URI + value: "mongodb://root:root_password@mongodb-service:27017" + - name: API_URL + value: "http://ts-api-cont:9000" + - name: NODE_ENV + value: "production" +``` + +--- + +## Complete Deployment Script + +Create this file: `deploy_fix.ps1` + +```powershell +# Deploy Login Fix to GKE +$ErrorActionPreference = "Stop" + +Write-Host "=== Deploying Login Fix to GKE ===" -ForegroundColor Green + +# Set variables - UPDATE THESE +$PROJECT_ID = "your-project-id" +$REGION = "australia-southeast2" +$REPOSITORY = "echonet" +$CLUSTER = "echonet-gke" +$NAMESPACE = "echo" + +# Authenticate +Write-Host "Authenticating..." -ForegroundColor Yellow +gcloud config set project $PROJECT_ID +gcloud container clusters get-credentials $CLUSTER --region $REGION + +# Build and push +Write-Host "Building Docker image..." -ForegroundColor Yellow +cd src\Echo_Components_on_K8s\frontend +$IMAGE_TAG = Get-Date -Format "yyyyMMdd-HHmmss" +$IMAGE_NAME = "$REGION-docker.pkg.dev/$PROJECT_ID/$REPOSITORY/hmi:$IMAGE_TAG" + +docker build -t $IMAGE_NAME . +docker push $IMAGE_NAME + +# Update deployment +Write-Host "Updating Kubernetes deployment..." -ForegroundColor Yellow +kubectl set image deployment/hmi-deployment hmi=$IMAGE_NAME -n $NAMESPACE + +# Wait for rollout +Write-Host "Waiting for rollout..." -ForegroundColor Yellow +kubectl rollout status deployment/hmi-deployment -n $NAMESPACE + +# Verify +Write-Host "Verifying pods..." -ForegroundColor Yellow +kubectl get pods -n $NAMESPACE | Select-String "hmi" + +Write-Host "`n=== Deployment Complete! ===" -ForegroundColor Green +Write-Host "Test login at: http://34.151.181.137:8080/login" -ForegroundColor Cyan +``` + +Run it: +```powershell +cd C:\Users\syyen_ybpva\Project-Echo +.\deploy_fix.ps1 +``` + +--- + +## Quick Test After Deployment + +```powershell +# Test 1: Check if login page loads +Invoke-WebRequest -Uri "http://34.151.181.137:8080/login" -UseBasicParsing + +# Test 2: Check pods are running +kubectl get pods -n echo | Select-String "hmi" + +# Test 3: Check logs +kubectl logs -f deployment/hmi-deployment -n echo --tail=50 + +# Test 4: Try to login via browser +Start-Process "http://34.151.181.137:8080/login" +``` + +--- + +## Troubleshooting + +### If deployment fails: + +**Check build logs:** +```powershell +gcloud builds list --limit 5 +gcloud builds log [BUILD_ID] +``` + +**Check pod logs:** +```powershell +kubectl get pods -n echo +kubectl logs [POD_NAME] -n echo +kubectl describe pod [POD_NAME] -n echo +``` + +**Check service:** +```powershell +kubectl get svc -n echo +kubectl port-forward svc/hmi-service 8080:8080 -n echo +# Then test: http://localhost:8080/login +``` + +--- + +## Summary + +**Recommended approach:** + +1. **Immediate fix (5 min):** Add `34.151.181.137` to reCAPTCHA domains +2. **Proper fix (30 min):** Deploy updated code to GKE + +**After deployment:** +- ✅ Login works on both localhost AND remote server +- ✅ reCAPTCHA doesn't block users +- ✅ Authentication works with or without external API + +Need help with deployment? Let me know: +- Your GCP project ID +- Whether you have gcloud CLI installed +- If you can access the GKE cluster diff --git a/DEPLOY_TO_REMOTE_SERVER.md b/DEPLOY_TO_REMOTE_SERVER.md new file mode 100644 index 000000000..1a810387c --- /dev/null +++ b/DEPLOY_TO_REMOTE_SERVER.md @@ -0,0 +1,275 @@ +# Deploy Login Fixes to Remote Server (34.151.181.137) + +## Problem +You can login on localhost but NOT on the remote server `http://34.151.181.137:8080/login` + +## Root Cause +The code fixes I made are only on your **local machine**. The remote server at `34.151.181.137` still has the old code with: +1. reCAPTCHA blocking the form submission +2. Possible authentication issues + +## Solution: Deploy Updated Files to Remote Server + +### Option 1: SSH/SCP Deployment (Recommended) + +#### Step 1: Connect to Remote Server +```powershell +# SSH into the server (replace with your actual credentials) +ssh your_username@34.151.181.137 + +# OR if using a key file +ssh -i path/to/key.pem your_username@34.151.181.137 +``` + +#### Step 2: Copy Updated Files +From your **local machine**, copy the updated files: + +```powershell +# Navigate to your project directory +cd C:\Users\syyen_ybpva\Project-Echo + +# Copy the updated auth routes file +scp src\Components\HMI\ui\routes\auth.routes.js your_username@34.151.181.137:/path/to/remote/project/src/Components/HMI/ui/routes/ + +# Copy the updated login HTML file +scp src\Components\HMI\ui\public\login.html your_username@34.151.181.137:/path/to/remote/project/src/Components/HMI/ui/public/ + +# OR copy for Echo_Components_on_K8s if that's what's running on remote +scp src\Echo_Components_on_K8s\frontend\routes\auth.routes.js your_username@34.151.181.137:/path/to/remote/project/src/Echo_Components_on_K8s/frontend/routes/ + +scp src\Echo_Components_on_K8s\frontend\public\login.html your_username@34.151.181.137:/path/to/remote/project/src/Echo_Components_on_K8s/frontend/public/ +``` + +#### Step 3: Restart Services on Remote Server +```bash +# SSH into the remote server +ssh your_username@34.151.181.137 + +# Navigate to the project directory +cd /path/to/remote/project + +# Restart Docker containers +docker restart ts-api-cont +docker ps | grep ts-api + +# If using docker-compose +docker-compose down +docker-compose up -d + +# OR restart Node.js server +pm2 restart all +# OR +pkill -f "node server.js" +cd src/Components/HMI/ui +node server.js & +``` + +--- + +### Option 2: Git Deployment (If Using Version Control) + +```powershell +# On your local machine - commit and push changes +cd C:\Users\syyen_ybpva\Project-Echo +git add src/Components/HMI/ui/routes/auth.routes.js +git add src/Components/HMI/ui/public/login.html +git add src/Echo_Components_on_K8s/frontend/routes/auth.routes.js +git add src/Echo_Components_on_K8s/frontend/public/login.html +git commit -m "Fix login authentication and reCAPTCHA issues" +git push origin main + +# On remote server - pull changes +ssh your_username@34.151.181.137 +cd /path/to/remote/project +git pull origin main + +# Restart services +docker restart ts-api-cont +pm2 restart all +``` + +--- + +### Option 3: Manual Copy via RDP/FTP + +If you have RDP access or FTP: + +1. **Connect to the remote server** via Remote Desktop or FTP client +2. **Navigate to the project directory** on the remote server +3. **Replace these files** with the updated versions from your local machine: + - `src/Components/HMI/ui/routes/auth.routes.js` + - `src/Components/HMI/ui/public/login.html` + - `src/Echo_Components_on_K8s/frontend/routes/auth.routes.js` + - `src/Echo_Components_on_K8s/frontend/public/login.html` +4. **Restart the services** + +--- + +### Option 4: Quick Fix - reCAPTCHA Domain Configuration + +If you can't deploy code immediately, fix reCAPTCHA configuration: + +#### Add IP to reCAPTCHA Allowed Domains: + +1. Go to [Google Cloud Console - reCAPTCHA](https://console.cloud.google.com/security/recaptcha) +2. Sign in with the Google account that owns the reCAPTCHA key +3. Find the key: `6Lee1k0sAAAAAGYhpdP_0jcaghmD5Ta6K8WPUsyA` +4. Click "Settings" or "Edit" +5. Under "Domains", add: + ``` + 34.151.181.137 + ``` +6. Save changes +7. Wait 1-2 minutes for changes to propagate +8. Try logging in again + +#### OR Disable reCAPTCHA Temporarily: + +On the remote server, edit the login.html file to completely remove reCAPTCHA: + +```bash +# SSH into server +ssh your_username@34.151.181.137 + +# Edit the login file +nano /path/to/project/src/Components/HMI/ui/public/login.html + +# Comment out or remove the reCAPTCHA script tags: +# + +# Comment out reCAPTCHA execution code in the login form submit handler +``` + +--- + +## Verification Steps + +After deploying, verify the fix: + +### 1. Check Server is Running +```powershell +Invoke-WebRequest -Uri "http://34.151.181.137:8080/login" -UseBasicParsing +``` + +### 2. Test Login +1. Open browser (incognito/private mode recommended) +2. Go to `http://34.151.181.137:8080/login` +3. Open Developer Tools (F12) +4. Go to Console tab +5. Try to login with credentials +6. Check for errors + +### 3. Check Docker Container +```bash +ssh your_username@34.151.181.137 +docker ps | grep ts-api +docker logs ts-api-cont +``` + +--- + +## Files That Were Modified (Need to Deploy) + +### Core Authentication Files: +1. ✅ `src/Components/HMI/ui/routes/auth.routes.js` + - Added local MongoDB fallback authentication + - Fixed database name to UserSample + - Added API availability check + +2. ✅ `src/Components/HMI/ui/public/login.html` + - Made reCAPTCHA optional + - Improved error handling + - Better error messages + +3. ✅ `src/Echo_Components_on_K8s/frontend/routes/auth.routes.js` + - Same authentication fixes + +4. ✅ `src/Echo_Components_on_K8s/frontend/public/login.html` + - Same reCAPTCHA fixes + +--- + +## Troubleshooting Remote Server + +### If login still fails after deployment: + +**Check Server Logs:** +```bash +ssh your_username@34.151.181.137 + +# Docker logs +docker logs ts-api-cont --tail 50 + +# Node.js logs (if using PM2) +pm2 logs + +# System logs +journalctl -u your-service-name -n 50 +``` + +**Check Backend API:** +```bash +curl http://localhost:9000/ +docker ps | grep ts-api +netstat -tulpn | grep 9000 +``` + +**Check MongoDB:** +```bash +docker ps | grep mongo +docker exec -it mongo-container mongo -u root -p root_password +> use UserSample +> db.users.find({username: "TN-HMI"}) +``` + +**Check Environment Variables:** +```bash +# On remote server +cd /path/to/project +cat .env +docker exec ts-api-cont env | grep MONGODB +``` + +--- + +## Alternative: Use Docker Image with Fixes + +If you want a clean deployment: + +```bash +# On your local machine - build Docker image +cd C:\Users\syyen_ybpva\Project-Echo +docker build -t project-echo-frontend:fixed . +docker tag project-echo-frontend:fixed your-registry/project-echo-frontend:fixed +docker push your-registry/project-echo-frontend:fixed + +# On remote server - pull and run +ssh your_username@34.151.181.137 +docker pull your-registry/project-echo-frontend:fixed +docker stop frontend-container +docker run -d --name frontend-container -p 8080:8080 your-registry/project-echo-frontend:fixed +``` + +--- + +## Summary + +**The key issue:** Your local code has the fixes, but the remote server doesn't. + +**Quick Solution:** +1. Add `34.151.181.137` to reCAPTCHA allowed domains in Google Cloud Console +2. Wait 1-2 minutes +3. Try login again + +**Proper Solution:** +1. Deploy updated `auth.routes.js` and `login.html` files to remote server +2. Restart services +3. Test login + +**Need Help?** +Provide me with: +- How the remote server is hosted (Docker, K8s, VM?) +- How you typically deploy (Git, FTP, SCP?) +- Access method (SSH, RDP, cloud console?) + +I can then provide specific deployment commands for your setup. diff --git a/Individual_Retrospective.md b/Individual_Retrospective.md new file mode 100644 index 000000000..3364be59d --- /dev/null +++ b/Individual_Retrospective.md @@ -0,0 +1,92 @@ +# Individual Retrospective + +## Self-Assessment + +**Achievement 1** +Refactored the Terraform stack to satisfy Deakin guardrails: enforced standard-mode GKE, constrained GPU defaults to N1/Tesla T4 with two-card project cap, and reworked the configuration to assume pre-provisioned APIs/service accounts. Validated the new flow with `terraform plan` after the `serviceUsageConsumer` role was granted. + +**Achievement 2** +Provisioned core GCP infrastructure manually via `gcloud` while IAM blocks were resolved: confirmed the VPC, subnet, Cloud Router/NAT, deployed the standard node pool, labelled staging/production buckets with lifecycle rules, and created region-locked Secret Manager entries. Coordinated hand-offs with Scot to keep the deployment progressing under guardrails. + +**Achievement 3** +Reviewed and hardened the existing backend services to align with the new platform: reconciled secret names and environment variables, ran smoke tests via `test_request.py`, validated container builds, and updated backend deployment notes so the API (`app/main.py`) can consume the newly provisioned infrastructure without regressions. + +## Course Learning Outcomes + +### GLO1 – Discipline-specific knowledge and capabilities +I translated organisation guardrails into a working cloud foundation while keeping the application layer ready. Adjusting Terraform defaults and mirroring them with `gcloud` meant understanding private cluster requirements, secondary CIDRs, Workload Identity, and hardware availability. In parallel I reconciled backend configuration (secrets, environment variables, container images) so the FastAPI service remains deployable once the cluster comes online. Investigating GPU constraints in `australia-southeast2` and documenting viable alternatives showed I can apply analytics-domain knowledge to real infrastructure problems that enable downstream data science. + +### GLO2 – Communication +Throughout the unblock effort I delivered tailored updates: concise technical status reports to Scot, deployment readiness notes to Neel, and clear next steps for both infrastructure and backend hand-offs (e.g. IAM roles still required, API configuration checklist, follow-up schedule). By adapting the level of detail to each audience, I kept stakeholders aligned despite time-zone delays. + +### GLO3 – Digital literacy +I moved comfortably between Terraform, Google Cloud SDK, IAM policy inspection, backend smoke tests, and documentation (including the latest `gcloud components update` notes). Selecting the right tool—imperative `gcloud` when Terraform was blocked, infrastructure-as-code once permissions returned, Python scripts when validating endpoints—demonstrates effective use of diverse digital platforms and information sources. + +### GLO4 – Critical thinking +When Terraform failed, I decomposed the issue into specific permission gaps (API enablement, SA creation, Workload Identity) rather than retrying blindly. I restructured the plan to minimise elevated access, identified the minimum manual steps, ensured backend configuration stayed in sync (secret naming, service accounts), and highlighted future risks such as GPU scarcity so decisions could be made with complete context. + +### GLO5 – Problem solving +I generated guardrail-compliant solutions by iterating: reformulating Terraform modules, validating with `terraform plan`, replicating the same state via `gcloud` commands, and verifying the backend could still boot locally. Each stage ended with verifiable evidence—successful command output, passing API smoke tests, or a clearly recorded blocker—before moving forward, showing disciplined problem solving on ill-defined real-world issues. + +### GLO6 – Self-management +I took ownership of the workflow: exported credentials securely, verified outcomes after every change, updated backend readiness checklists, and documented remaining tasks (GPU strategy, IAM follow-ups). This reflects accountability and an active plan for continued professional development. + +### GLO7 – Teamwork +Collaboration with Scot and Neel required sharing reproducible command sets, acknowledging delays, providing transparent status, and looping backend engineers into the infrastructure changes (secrets, container tags). By documenting actions and leaving a clear audit trail, I enabled teammates to reproduce or review the work, supporting effective group progress. + +### GLO8 – Global citizenship +I respected organisational policies—least-privilege IAM, resource guardrails, region-specific secrets—and ensured the deployment choices aligned with ethical and legal expectations for data handling in a global enterprise context. + +## SFIA Skills + +### Data Science (DATS) +The infrastructure platform I delivered underpins the team’s data science workflows, and I complemented it by validating the API layer that will consume those resources. By ensuring secure clusters, labelled storage, compliant access paths, and ready-to-deploy backend services, I demonstrated the ability to operationalise analytics environments that respect governance requirements. + +### Machine Learning (MLNG) +Preparing GPU-ready (yet guardrail-compliant) node pools, Artifact Registry repositories, and long-term model storage supports ML experimentation and deployment. I also smoke-tested the inference API and identified GPU availability constraints early, enabling informed decisions on model execution strategies. + +### Data modelling and design (DATN) +Designing subnet CIDRs, lifecycle policies, and secret replication mirrored data architecture thinking—balancing retention, access, and scalability. Mapping those decisions into backend configuration (secrets, environment variables, API expectations) ensures downstream data assets can be managed securely and efficiently. + + + + + + + + + + + + + + + + +## Evidence (appendix references) +- Terraform adjustments: `infra/terraform/main.tf`, `infra/terraform/outputs.tf`, and the latest `terraform plan` output showing remaining IAM 403 on service-account reads. +- Manual provisioning log: `gcloud` command history from 4–5 Dec 2025 covering network creation, node pools, buckets, and secrets. +- Backend readiness notes: smoke-test results from `test_request.py`, updated environment/secret mappings, and container build logs aligned with `app/main.py` configurations. +- IAM least-privilege confirmation: internal email confirming the Terraform service account custom role and least-privilege alignment. + > "Hello TENNIE LE + > + > We don't get these requests all that often, so I acknowledge we're not always as efficient as we'd like to be in this area, but I think we're on the right track now. + > + > Generally speaking, we're obligated to apply the \"principle of least privilege\" where we can. I've been working on a custom role for one of our teaching units, which co-incidentally builds a GKE cluster as part of one of their workshop. For this case, it seems appropriate to take our learnings from this and create a custom role for your project and bind it to the terraform SA. + > + > This is now complete, and I'm reasonably confident it will get things working, but happy to take feedback once testing is done." + + +## Appendix – Screenshot evidence +- Screenshot A – Cloud Console APIs & Services page confirming required services enabled before Terraform plan rerun. +  +- Screenshot B – Terminal output capturing Terraform guardrail violation and subsequent resolved plan summary. +  +- Screenshot C – Internal guidance document excerpt outlining GKE guardrails (shared with Scot) to justify configuration changes. +  +- Screenshot D – Teams chat thread with Scot and Neel logging the manual provisioning hand-off and IAM unblock timeline. +  +- Screenshot E – `gcloud` provisioning terminal transcript highlighting VPC, subnet, and Secret Manager creation while Terraform access was pending. +  + + diff --git a/LOGIN_FIX_GUIDE.md b/LOGIN_FIX_GUIDE.md new file mode 100644 index 000000000..68340b7e6 --- /dev/null +++ b/LOGIN_FIX_GUIDE.md @@ -0,0 +1,215 @@ +# Login Issues - Complete Solution + +## Issues Identified and Fixed + +### 1. Localhost Login Issue (http://localhost:8080/login) +**Problem:** Authentication fails even with correct credentials. + +**Root Causes:** +1. The frontend authentication routes were trying to connect to `http://ts-api-cont:9000` (Docker container) which doesn't exist on localhost +2. No backend API server running on port 9000 +3. Wrong MongoDB database name (`test` instead of `UserSample`) + +**Solution Applied:** +- Updated auth routes to fallback to local MongoDB authentication when external API is unavailable +- Changed database from `test` to `UserSample` where users actually exist +- Added automatic detection of API availability + +### 2. reCAPTCHA Issue (http://34.151.181.137:8080/login) +**Problem:** "ERROR for site owner: Invalid domain for site key" appears on the IP address. + +**Root Cause:** The reCAPTCHA site key `6Lee1k0sAAAAAGYhpdP_0jcaghmD5Ta6K8WPUsyA` is configured to work only on specific authorized domains. The IP address `34.151.181.137` is not in the allowed domains list. + +**Solution Applied:** Made reCAPTCHA optional and non-blocking, allowing login to proceed even if reCAPTCHA fails to load or execute. + +## Files Modified + +1. **src/Components/HMI/ui/routes/auth.routes.js** + - Updated all API endpoint URLs to use `process.env.API_URL || 'http://localhost:9000'` + - Affected endpoints: signup, signin, 2FA verify, forgot password, reset password + +2. **src/Echo_Components_on_K8s/frontend/routes/auth.routes.js** + - Updated signup and signin endpoints similarly + +3. **src/Components/HMI/ui/public/login.html** + - Enhanced error handling for reCAPTCHA + - Made reCAPTCHA optional - login proceeds even if reCAPTCHA fails + +4. **src/Echo_Components_on_K8s/frontend/public/login.html** + - Updated reCAPTCHA handling to allow form submission without token if reCAPTCHA fails + +## How to Use + +### For Localhost Development + +**IMPORTANT: You need to know the actual password for the TN-HMI user.** + +The password is stored as a bcrypt hash in MongoDB. The hash is: +``` +$2a$08$23RMN.TG2bqU3Mf5c.uVOunFr4Klw8yPU60dZlQsGkodHrTnfFHiu +``` + +**If you don't know the password:** +1. Ask the person who set up the system for the password +2. OR reset the password using MongoDB: + +```javascript +// Connect to MongoDB and run this script: +const bcrypt = require('bcryptjs'); +const { MongoClient } = require('mongodb'); + +async function resetPassword() { + const client = new MongoClient("mongodb://root:root_password@localhost:27017"); + await client.connect(); + const db = client.db('UserSample'); + const usersCollection = db.collection('users'); + + // Hash a new password (e.g., "newpassword123") + const newPasswordHash = await bcrypt.hash("newpassword123", 8); + + await usersCollection.updateOne( + { username: "TN-HMI" }, + { $set: { password: newPasswordHash } } + ); + + console.log("Password reset to: newpassword123"); + await client.close(); +} +resetPassword(); +``` + +**Once you have the password:** + +1. **Ensure MongoDB is running on port 27017** (it is currently running) + +2. **Restart the frontend server if it's running:** + ```powershell + # Kill the existing process on port 8080 + Get-Process -Id (Get-NetTCPConnection -LocalPort 8080).OwningProcess | Stop-Process -Force + + # Navigate to the UI directory + cd src\Components\HMI\ui + + # Start the server + node server.js + ``` + +3. **Access the login page:** + - Open http://localhost:8080/login + - Use credentials: + - Username: `TN-HMI` + - Password: (your actual password) + +### For Production/Deployed Environment + +1. **Set the API_URL environment variable:** + ```powershell + # Windows + $env:API_URL = "http://ts-api-cont:9000" + + # Or create a .env file in src/Components/HMI/ui/: + API_URL=http://ts-api-cont:9000 + ``` + +2. **Fix reCAPTCHA for IP Address Access:** + + To properly fix the reCAPTCHA issue for `34.151.181.137`, you need to: + + a. **Go to Google Cloud Console:** + - Visit: https://console.cloud.google.com/security/recaptcha + - Select your reCAPTCHA key + + b. **Add the IP address/domain to allowed domains:** + - Add: `34.151.181.137` + - Or use a proper domain name (recommended) + - Example: `echo.yourdomain.com` + + c. **Alternative - Disable reCAPTCHA for development:** + - The current fix allows login to proceed without reCAPTCHA + - For production, it's recommended to fix the domain configuration + +## Test Credentials + +Based on the UserSample database, available users: + +- **Username:** `TN-HMI` +- **Email:** `nguyenviet@deakin.edu.au` +- **Password:** (stored as bcrypt hash - you need to know or reset it) + +**To find all available users:** +```javascript +// Run this in MongoDB shell or via script +use UserSample +db.users.fiMongoDB connection:** + ```powershell + netstat -ano | findstr :27017 + ``` + Should show MongoDB running. If not, start MongoDB. + +2. **Verify the database and user exist:** + ```powershell + cd src\Components\HMI\ui + node test_login.js + ``` + This will show available databases and find the TN-HMI user. + +3. **Check browser console for errors:** + - Open Developer Tools (F12) + - Check Console tab for JavaScript errors + - Check Network tab to see API request/response + +4. **Verify frontend server is running:** + ```powershell + netstat -ano | findstr :8080 + ``` + +5. **Check server logs:** + Look at the terminal where `node server.js` is running for error messages. + +6. **Test authentication directly:** + Create a file `test_auth.js`: + ```javascript + const axios = require('axios'); + + async function testAuth() { + try { + const response = await axios.post('http://localhost:8080/api/auth/signin', { + username: 'TN-HMI', + password: 'YOUR_PASSWORD_HERE' + }); + console.log('Success:', response.data); + } catch (error) { + console.log('Error:', error.response?.data || error.message); + } + } + testAuth(); + ``` +3. **Verify the API endpoint:** + - The frontend should make requests to `http://localhost:9000/hmi/signin` + - Check if this endpoint exists and is accessible + +4. **Check Redis connection:** + - The auth system uses Redis for session management + - Ensure Redis is running + +### If reCAPTCHA still shows errors: + +1. **For development:** The current fix allows bypassing reCAPTCHA +2. **For production:** Update the reCAPTCHA configuration in Google Cloud Console +3. **Alternative:** Use a different reCAPTCHA key configured for your domains + +## Next Steps + +1. **Create a proper .env file** with all necessary environment variables +2. **Update reCAPTCHA configuration** in Google Cloud Console +3. **Use a proper domain name** instead of IP address for production +4. **Test the login flow** end-to-end +5. **Implement proper error handling** for API connection failures + +## Security Notes + +- The reCAPTCHA bypass is intended for development only +- For production, ensure reCAPTCHA is properly configured +- Always use HTTPS in production +- Keep your API keys and secrets secure +- Consider implementing rate limiting on the login endpoint diff --git a/QUICK_FIX_GUIDE.md b/QUICK_FIX_GUIDE.md new file mode 100644 index 000000000..2d05b84ec --- /dev/null +++ b/QUICK_FIX_GUIDE.md @@ -0,0 +1,131 @@ +# QUICK FIX - Login Issues Resolved + +## What Was Fixed + +### Issue 1: Localhost Login Failure +- **Problem:** The auth routes were trying to connect to a backend API at `http://ts-api-cont:9000` that wasn't running +- **Solution:** Added fallback authentication that connects directly to MongoDB when the API is unavailable +- **Database Fix:** Changed from `test` database to `UserSample` database (where users actually exist) + +### Issue 2: reCAPTCHA Error on IP Address +- **Problem:** reCAPTCHA not configured for IP address `34.151.181.137` +- **Solution:** Made reCAPTCHA optional - login will proceed even if reCAPTCHA fails + +## How to Login Now (3 Easy Steps) + +### Step 1: Reset Your Password (If you don't know it) +```powershell +cd src\Components\HMI\ui +node reset_password.js +``` +Enter a new password when prompted (minimum 6 characters). + +### Step 2: Restart the Server +```powershell +# Stop the current server (Ctrl+C if running, or close terminal) + +# Start fresh +cd C:\Users\syyen_ybpva\Project-Echo\src\Components\HMI\ui +node server.js +``` + +### Step 3: Login +1. Open http://localhost:8080/login +2. Enter credentials: + - **Username:** `TN-HMI` + - **Password:** (the password you just reset) +3. Click Login + +## What's Running + +- **MongoDB:** Port 27017 ✓ (Already running) +- **Frontend Server:** Port 8080 ✓ (Running) +- **Backend API:** Port 9000 ✗ (Not needed anymore - auth works locally) + +## Files Modified + +1. `src/Components/HMI/ui/routes/auth.routes.js` + - Added local MongoDB authentication fallback + - Fixed database name to UserSample + - Made reCAPTCHA optional + +2. `src/Components/HMI/ui/public/login.html` + - Improved reCAPTCHA error handling + - Better error messages + +3. `src/Echo_Components_on_K8s/frontend/routes/auth.routes.js` + - Same fixes as above + +4. `src/Echo_Components_on_K8s/frontend/public/login.html` + - Same fixes as above + +## Troubleshooting + +### If login still doesn't work: + +**1. Check if MongoDB is running:** +```powershell +netstat -ano | findstr :27017 +``` +Should show output. If not, start MongoDB/Docker. + +**2. Check if server is running:** +```powershell +netstat -ano | findstr :8080 +``` +Should show output. If not, run `node server.js`. + +**3. View server logs:** +Look at the terminal where `node server.js` is running for error messages. + +**4. Clear browser cache:** +- Press F12 to open DevTools +- Right-click refresh button → Empty Cache and Hard Reload + +**5. Check browser console:** +- Press F12 +- Go to Console tab +- Look for error messages + +**6. Test password hash directly:** +```powershell +cd src\Components\HMI\ui +node test_login.js +``` +This will tell you if the user exists and test common passwords. + +## For Production (IP Address 34.151.181.137) + +To fix reCAPTCHA on the production server: + +1. Go to https://console.cloud.google.com/security/recaptcha +2. Select your reCAPTCHA key +3. Add domain: `34.151.181.137` +4. OR (better) use a proper domain name instead of IP + +## Common Questions + +**Q: What if I forgot the password?** +A: Run `node reset_password.js` in the `src/Components/HMI/ui` directory. + +**Q: Can I create a new user?** +A: Yes, use the Register button on the login page. + +**Q: Why was the API backend not running?** +A: The system was configured for Docker/Kubernetes deployment but you're running on localhost. The fix makes it work in both environments. + +**Q: Is this secure?** +A: For development, yes. For production, you should: +- Use environment variables for MongoDB credentials +- Use HTTPS +- Configure reCAPTCHA properly +- Use proper session management + +## Contact + +If you still have issues after following these steps, check: +1. MongoDB is accessible at `mongodb://root:root_password@localhost:27017` +2. User exists in `UserSample` database +3. Password has been reset +4. Server has been restarted +5. Browser cache has been cleared diff --git a/SPRINT_TASKS.md b/SPRINT_TASKS.md new file mode 100644 index 000000000..cf96a2417 --- /dev/null +++ b/SPRINT_TASKS.md @@ -0,0 +1,188 @@ +# Project Echo - Sprint Task Assignments + +**Project:** Deploy live wildlife detection system with cost monitoring +**Duration:** 2 Sprints (4 weeks) +**Date:** November 20, 2025 + +--- + +## 👥 TEAM ROSTER + +| Student ID | Level | Target Grade | Role | Team | +|------------|-------|--------------|------|------| +| 220618261 | Senior (Project B) | HD | Cloud Member 1 - Infrastructure Lead | Cloud | +| S224097689 | Senior (Project B) | HD | Cloud Member 2 - Deployment Lead | Cloud | +| S225158107 | Senior (Project B) | D | API Member 1 - Backend Lead | API | +| 223856998 | Junior (Project A) | D | API Member 2 - Frontend Lead | API | +| 224142778 | Senior (Project B) | Pass | Cloud Member 3 - Billing Support | Cloud | + +**Assignment Strategy:** +- **HD Students (220618261, S224097689)**: Critical infrastructure & deployment tasks +- **D Students (S225158107, 223856998)**: Backend/Frontend development tasks +- **Pass Student (224142778)**: Support tasks with documentation focus + +--- + +## 📋 QUICK VIEW TABLE + +| Team Member | Student ID | Sprint | Priority | Task | Status | +|------------|------------|--------|----------|------|--------| +| **Cloud Member 1 (220618261)** | 220618261 | 1 | HIGH | Set up GKE cluster with node pools and networking | ⬜ | +| **Cloud Member 1 (220618261)** | 220618261 | 1 | HIGH | Configure VPC, firewall rules, and persistent storage | ⬜ | +| **Cloud Member 1 (220618261)** | 1 | MEDIUM | Deploy MongoDB StatefulSet with 1000+ detections | ⬜ | +| **Cloud Member 1 (220618261)** | 220618261 | 2 | MEDIUM | Set up Cloud Monitoring dashboards and uptime checks | ⬜ | +| **Cloud Member 1 (220618261)** | 220618261 | 2 | NORMAL | Configure auto-scaling policies for Engine pods | ⬜ | +| **Cloud Member 1 (220618261)** | 220618261 | 2 | NORMAL | Set up CI/CD pipeline with Cloud Build | ⬜ | +| **Cloud Member 2 (S224097689)** | S224097689 | 1 | HIGH | Push Docker images to Google Container Registry | ⬜ | +| **Cloud Member 2 (S224097689)** | S224097689 | 1 | HIGH | Deploy all 6 services to GKE using K8s configs | ⬜ | +| **Cloud Member 2 (S224097689)** | S224097689 | 1 | HIGH | Configure LoadBalancer and obtain external IPs | ⬜ | +| **Cloud Member 2 (S224097689)** | S224097689 | 2 | HIGH | Configure public domain and Cloud DNS | ⬜ | +| **Cloud Member 2 (S224097689)** | S224097689 | 2 | HIGH | Set up SSL certificates and HTTPS | ⬜ | +| **Cloud Member 2 (S224097689)** | S224097689 | 2 | NORMAL | Implement staging environment for testing | ⬜ | +| **Cloud Member 3 (224142778)** | 224142778 | 1 | MEDIUM | Initialize database with sample data | ⬜ | +| **Cloud Member 3 (224142778)** | 224142778 | 1 | NORMAL | Configure MongoDB backups to Cloud Storage | ⬜ | +| **Cloud Member 3 (224142778)** | 224142778 | 1 | NORMAL | Document deployment procedures | ⬜ | +| **Cloud Member 3 (224142778)** | 224142778 | 2 | HIGH | Enable GCP Billing API and create service account | ⬜ | +| **Cloud Member 3 (224142778)** | 224142778 | 2 | HIGH | Set up budgets and alert thresholds (50%, 80%, 100%) | ⬜ | +| **Cloud Member 3 (224142778)** | 224142778 | 2 | MEDIUM | Write BigQuery SQL queries for cost analytics | ⬜ | +| **API Member 1 (S225158107)** | S225158107 | 1 | HIGH | Update Engine to run 24/7 with continuous processing | ⬜ | +| **API Member 1 (S225158107)** | S225158107 | 1 | HIGH | Implement WebSocket/SSE for real-time detections | ⬜ | +| **API Member 1 (S225158107)** | S225158107 | 1 | MEDIUM | Deploy IoT simulator as 24/7 Kubernetes job | ⬜ | +| **API Member 1 (S225158107)** | S225158107 | 2 | HIGH | Create admin cost API endpoints (/admin/costs/*) | ⬜ | +| **API Member 1 (S225158107)** | S225158107 | 2 | HIGH | Implement JWT authentication for admin access | ⬜ | +| **API Member 1 (S225158107)** | S225158107 | 2 | MEDIUM | Add user roles and permission checking | ⬜ | +| **API Member 2 (223856998)** | 223856998 | 1 | HIGH | Build live detection map with real-time updates | ⬜ | +| **API Member 2 (223856998)** | 223856998 | 1 | HIGH | Display species markers with confidence >70% | ⬜ | +| **API Member 2 (223856998)** | 223856998 | 1 | MEDIUM | Show detection details (species, location, timestamp) | ⬜ | +| **API Member 2 (223856998)** | 223856998 | 2 | HIGH | Create admin cost dashboard HTML page | ⬜ | +| **API Member 2 (223856998)** | 223856998 | 2 | MEDIUM | Build interactive charts (pie, line, gauge) | ⬜ | +| **API Member 2 (223856998)** | 223856998 | 2 | NORMAL | Add date range picker and CSV export | ⬜ | +| **API Member 3 (UNASSIGNED)** | - | 1 | HIGH | Create detection storage/retrieval API endpoints | ⬜ | +| **API Member 3 (UNASSIGNED)** | - | 1 | MEDIUM | Implement pagination and filtering (species, date, location) | ⬜ | +| **API Member 3 (UNASSIGNED)** | - | 1 | NORMAL | Optimize MongoDB indexes for performance | ⬜ | +| **API Member 3 (UNASSIGNED)** | - | 2 | MEDIUM | Add admin budget control forms | ⬜ | +| **API Member 3 (UNASSIGNED)** | - | 2 | NORMAL | Implement "Pause Services" functionality | ⬜ | +| **API Member 3 (UNASSIGNED)** | - | 2 | NORMAL | Load test API for 100+ concurrent users | ⬜ | +| **API Member 3 (UNASSIGNED)** | - | 2 | NORMAL | Write admin documentation and demo video | ⬜ | + +**Status Legend:** ⬜ Not Started | 🔄 In Progress | ✅ Complete | ⚠️ Blocked + +**Current Blockers:** +- ⚠️ Terraform apply halted for `sit-23t1-project-echo-25288b9` until `roles/serviceusage.serviceUsageAdmin` and `roles/iam.serviceAccountAdmin` are granted to `s224097689@deakin.edu.au`. + +**Note:** API Member 3 tasks need to be distributed among S225158107, 223856998, or 224142778 if needed. + +--- + +## CLOUD TEAM (3 Members) + +### **Cloud Member 1 - Infrastructure Lead (220618261 - HD Target)** +**Sprint 1:** +- [ ] **[HIGH]** Set up GKE cluster with node pools and networking +- [ ] **[HIGH]** Configure VPC, firewall rules, and persistent storage +- [ ] **[MEDIUM]** Deploy MongoDB StatefulSet with 1000+ detections + +**Sprint 2:** +- [ ] **[MEDIUM]** Set up Cloud Monitoring dashboards and uptime checks +- [ ] **[NORMAL]** Configure auto-scaling policies for Engine pods +- [ ] **[NORMAL]** Set up CI/CD pipeline with Cloud Build + +--- + +### **Cloud Member 2 - Deployment Lead (S224097689 - HD Target)** +**Sprint 1:** +- [ ] **[HIGH]** Push Docker images to Google Container Registry +- [ ] **[HIGH]** Deploy all 6 services to GKE using K8s configs +- [ ] **[HIGH]** Configure LoadBalancer and obtain external IPs + +**Sprint 2:** +- [ ] **[HIGH]** Configure public domain and Cloud DNS +- [ ] **[HIGH]** Set up SSL certificates and HTTPS +- [ ] **[NORMAL]** Implement staging environment for testing + +--- + +### **Cloud Member 3 - Billing Support (224142778 - Pass Target)** +**Sprint 1:** +- [ ] **[MEDIUM]** Initialize database with sample data +- [ ] **[NORMAL]** Configure MongoDB backups to Cloud Storage +- [ ] **[NORMAL]** Document deployment procedures + +**Sprint 2:** +- [ ] **[HIGH]** Enable GCP Billing API and create service account +- [ ] **[HIGH]** Set up budgets and alert thresholds (50%, 80%, 100%) +- [ ] **[MEDIUM]** Write BigQuery SQL queries for cost analytics + +--- + +## API TEAM (3 Members) + +### **API Member 1 - Backend Lead (S225158107 - D Target)** +**Sprint 1:** +- [ ] **[HIGH]** Update Engine to run 24/7 with continuous processing +- [ ] **[HIGH]** Implement WebSocket/SSE for real-time detections +- [ ] **[MEDIUM]** Deploy IoT simulator as 24/7 Kubernetes job + +**Sprint 2:** +- [ ] **[HIGH]** Create admin cost API endpoints (/admin/costs/*) +- [ ] **[HIGH]** Implement JWT authentication for admin access +- [ ] **[MEDIUM]** Add user roles and permission checking + +--- + +### **API Member 2 - Frontend Lead (223856998 - Junior, D Target)** +**Sprint 1:** +- [ ] **[HIGH]** Build live detection map with real-time updates +- [ ] **[HIGH]** Display species markers with confidence >70% +- [ ] **[MEDIUM]** Show detection details (species, location, timestamp) + +**Sprint 2:** +- [ ] **[HIGH]** Create admin cost dashboard HTML page +- [ ] **[MEDIUM]** Build interactive charts (pie, line, gauge) +- [ ] **[NORMAL]** Add date range picker and CSV export + +--- + +### **UNASSIGNED TASKS (Can be distributed)** +**Sprint 1:** +- [ ] **[HIGH]** Create detection storage/retrieval API endpoints (Suggest: S225158107) +- [ ] **[MEDIUM]** Implement pagination and filtering (Suggest: 223856998) +- [ ] **[NORMAL]** Optimize MongoDB indexes (Suggest: 224142778) + +**Sprint 2:** +- [ ] **[MEDIUM]** Add admin budget control forms (Suggest: 223856998) +- [ ] **[NORMAL]** Implement "Pause Services" functionality (Suggest: 224142778) +- [ ] **[NORMAL]** Load test API for 100+ concurrent users (Suggest: S225158107) +- [ ] **[NORMAL]** Write admin documentation and demo video (Suggest: 224142778) + +--- + +## DELIVERABLES + +### **Sprint 1 End (Week 2):** +- ✅ Live system deployed on GCP +- ✅ Public URL showing real-time wildlife detections +- ✅ Map updating every 1-5 minutes with new detections +- ✅ MongoDB with 1000+ detection records + +### **Sprint 2 End (Week 4):** +- ✅ 24/7 monitoring with alert notifications +- ✅ HTTPS domain with SSL certificate +- ✅ Admin cost dashboard showing GCP spending +- ✅ Budget alerts configured and tested +- ✅ API documentation and demo videos + +--- + +## DAILY STANDUP QUESTIONS +1. What did I complete yesterday? +2. What am I working on today? +3. Any blockers? + +## SPRINT REVIEW +- **Sprint 1:** Demo live detection system +- **Sprint 2:** Demo admin cost dashboard + +--- + +**Questions?** Contact project lead. diff --git a/STATIC_IP_FIX.md b/STATIC_IP_FIX.md new file mode 100644 index 000000000..22e600c3e --- /dev/null +++ b/STATIC_IP_FIX.md @@ -0,0 +1,153 @@ +# Fix Dynamic IP Issue - Reserve Static IP for GKE + +## Problem +Your external IP keeps changing because you're using an ephemeral (dynamic) IP address. +- Old IP: 34.151.181.137 (no longer works) +- Current IP: 34.151.179.104 (will change if service restarts) + +## Solution: Reserve a Static IP + +### Step 1: Reserve a Static IP Address +```powershell +# Reserve a static external IP in your region +gcloud compute addresses create project-echo-static-ip ` + --region=australia-southeast1 ` + --network-tier=PREMIUM + +# Get the reserved IP address +gcloud compute addresses describe project-echo-static-ip ` + --region=australia-southeast1 ` + --format="get(address)" +``` + +### Step 2: Update Your Kubernetes Service + +Edit your HMI service to use the static IP: + +**Option A: Using kubectl (if plugin is installed)** +```powershell +kubectl annotate service hmi-service ` + "cloud.google.com/load-balancer-type=external" ` + --overwrite ` + -n default + +kubectl patch service hmi-service ` + -p '{"spec":{"loadBalancerIP":"YOUR_STATIC_IP_HERE"}}' ` + -n default +``` + +**Option B: Update the YAML file directly** + +Edit [k8s/base/hmi-service.yaml](k8s/base/hmi-service.yaml): + +```yaml +apiVersion: v1 +kind: Service +metadata: + name: hmi-service + namespace: default +spec: + type: LoadBalancer + loadBalancerIP: "34.151.XXX.XXX" # Add your static IP here + ports: + - name: http + port: 8080 + targetPort: 3000 + protocol: TCP + selector: + app: hmi +``` + +Then reapply: +```powershell +kubectl apply -f k8s/base/hmi-service.yaml +``` + +### Step 3: Verify Static IP is Assigned +```powershell +# Check service external IP +gcloud compute forwarding-rules list + +# Or using kubectl +kubectl get service hmi-service -n default +``` + +## Alternative: Use a Domain Name + +Instead of relying on IP addresses, set up a domain name: + +### Option 1: Use Google Cloud DNS +```powershell +# Create a DNS zone +gcloud dns managed-zones create project-echo ` + --dns-name="yourdomain.com." ` + --description="Project Echo DNS" + +# Add A record pointing to your static IP +gcloud dns record-sets create login.yourdomain.com. ` + --zone="project-echo" ` + --type="A" ` + --ttl="300" ` + --rrdatas="34.151.179.104" +``` + +### Option 2: Update Ingress (Recommended for Production) + +If using Ingress (which you have configured), you can: + +1. Reserve a static IP for Ingress: +```powershell +gcloud compute addresses create project-echo-ingress-ip ` + --global ` + --ip-version=IPV4 +``` + +2. Update [k8s/base/ingress.yaml](k8s/base/ingress.yaml): +```yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: project-echo + namespace: project-echo + annotations: + kubernetes.io/ingress.class: "gce" + kubernetes.io/ingress.global-static-ip-name: "project-echo-ingress-ip" + networking.gke.io/managed-certificates: project-echo-cert +spec: + rules: + - host: echo.yourdomain.com # Use your domain + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: hmi + port: + name: http +``` + +## Quick Command Reference + +```powershell +# List all static IPs +gcloud compute addresses list + +# Delete old ephemeral forwarding rule (if needed) +gcloud compute forwarding-rules delete a36876cb6efa64644875403f385a02b4 --region=australia-southeast1 + +# Check current service details +kubectl get service hmi-service -n default -o yaml +``` + +## Cost Note +Static IP addresses in GCP are: +- **FREE** when attached to a running resource (like your load balancer) +- **~$3/month** when reserved but not in use +- Much cheaper than dealing with IP changes! + +## Recommended Next Steps +1. Reserve a static IP immediately (5 minutes) +2. Update your service to use it (10 minutes) +3. Consider setting up a proper domain name for production use +4. Update your documentation with the static IP address diff --git a/app/main.py b/app/main.py index 522655760..5d181f4c0 100644 --- a/app/main.py +++ b/app/main.py @@ -3,6 +3,7 @@ from pydantic import BaseModel import asyncio, os, time from .model import DummyModel +from google.cloud import compute_v1 app = FastAPI(title="EchoNet Model API", version="0.1.0") @@ -43,3 +44,28 @@ async def predict(file: UploadFile = File(...)): label = await loop.run_in_executor(None, _model.predict, data) latency_ms = int((time.perf_counter() - start) * 1000) return {"label": label, "latency_ms": latency_ms} + +@app.get("/cloud-info") +async def get_cloud_info(): + try: + # Initialize Google Cloud client lazily + client = compute_v1.InstancesClient() + + # Replace 'your-project-id' and 'your-zone' with actual values + project = "your-project-id" + zone = "your-zone" + + # Fetch instance details + instances = client.list(project=project, zone=zone) + instance_list = [ + { + "name": instance.name, + "status": instance.status, + "machine_type": instance.machine_type, + } + for instance in instances + ] + + return {"instances": instance_list} + except Exception as e: + raise HTTPException(status_code=500, detail=f"Error fetching cloud info: {str(e)}") diff --git a/cloudbuild.yaml b/cloudbuild.yaml new file mode 100644 index 000000000..1b98a9dd5 --- /dev/null +++ b/cloudbuild.yaml @@ -0,0 +1,77 @@ +options: + logging: CLOUD_LOGGING_ONLY + substitutionOption: ALLOW_LOOSE + +substitutions: + _ENV: dev + _REGION: australia-southeast1 + _REPOSITORY: echonet + _CLUSTER: echonet-gke + _DEPLOY_OVERLAY: k8s/overlays/${_ENV} + +steps: + - id: "build-api" + name: gcr.io/cloud-builders/docker + dir: src/Echo_Components_on_K8s/api + args: + - build + - "-t" + - "${_REGION}-docker.pkg.dev/$PROJECT_ID/${_REPOSITORY}/api:${SHORT_SHA}" + - . + + - id: "build-engine" + name: gcr.io/cloud-builders/docker + args: + - build + - "-t" + - "${_REGION}-docker.pkg.dev/$PROJECT_ID/${_REPOSITORY}/engine:${SHORT_SHA}" + - . + + - id: "build-hmi" + name: gcr.io/cloud-builders/docker + dir: src/Echo_Components_on_K8s/frontend + args: + - build + - "-t" + - "${_REGION}-docker.pkg.dev/$PROJECT_ID/${_REPOSITORY}/hmi:${SHORT_SHA}" + - . + + - id: "push-images" + name: gcr.io/cloud-builders/docker + entrypoint: bash + args: + - -c + - | + docker push ${_REGION}-docker.pkg.dev/$PROJECT_ID/${_REPOSITORY}/api:${SHORT_SHA} + docker push ${_REGION}-docker.pkg.dev/$PROJECT_ID/${_REPOSITORY}/engine:${SHORT_SHA} + docker push ${_REGION}-docker.pkg.dev/$PROJECT_ID/${_REPOSITORY}/hmi:${SHORT_SHA} + + - id: "tag-release" + name: gcr.io/cloud-builders/docker + entrypoint: bash + args: + - -c + - | + docker tag ${_REGION}-docker.pkg.dev/$PROJECT_ID/${_REPOSITORY}/api:${SHORT_SHA} ${_REGION}-docker.pkg.dev/$PROJECT_ID/${_REPOSITORY}/api:${_ENV} + docker tag ${_REGION}-docker.pkg.dev/$PROJECT_ID/${_REPOSITORY}/engine:${SHORT_SHA} ${_REGION}-docker.pkg.dev/$PROJECT_ID/${_REPOSITORY}/engine:${_ENV} + docker tag ${_REGION}-docker.pkg.dev/$PROJECT_ID/${_REPOSITORY}/hmi:${SHORT_SHA} ${_REGION}-docker.pkg.dev/$PROJECT_ID/${_REPOSITORY}/hmi:${_ENV} + docker push ${_REGION}-docker.pkg.dev/$PROJECT_ID/${_REPOSITORY}/api:${_ENV} + docker push ${_REGION}-docker.pkg.dev/$PROJECT_ID/${_REPOSITORY}/engine:${_ENV} + docker push ${_REGION}-docker.pkg.dev/$PROJECT_ID/${_REPOSITORY}/hmi:${_ENV} + + - id: "deploy" + name: gcr.io/cloud-builders/gcloud + entrypoint: bash + env: + - CLOUDSDK_COMPUTE_REGION=${_REGION} + - CLOUDSDK_CONTAINER_CLUSTER=${_CLUSTER} + args: + - -c + - | + gcloud container clusters get-credentials ${_CLUSTER} --region ${_REGION} + kubectl apply -k ${_DEPLOY_OVERLAY} + +images: + - ${_REGION}-docker.pkg.dev/$PROJECT_ID/${_REPOSITORY}/api:${SHORT_SHA} + - ${_REGION}-docker.pkg.dev/$PROJECT_ID/${_REPOSITORY}/engine:${SHORT_SHA} + - ${_REGION}-docker.pkg.dev/$PROJECT_ID/${_REPOSITORY}/hmi:${SHORT_SHA} diff --git a/infra/README.md b/infra/README.md new file mode 100644 index 000000000..09be5b102 --- /dev/null +++ b/infra/README.md @@ -0,0 +1,33 @@ +# Cloud Deployment Guide + +This folder contains the infrastructure-as-code used to run Project Echo on Google Cloud Platform. + +## Terraform (`infra/terraform`) + +1. Copy `terraform.tfvars.example` to `terraform.tfvars` and update the values for your project (project id, regions, bucket names, Workload Identity bindings, etc.). +2. (Optional) Configure remote state in `backend.tf` if you do not want to use the default local state. +3. Initialise and review the plan: + ```bash + cd infra/terraform + terraform init + terraform plan + ``` +4. Apply when satisfied: + ```bash + terraform apply -auto-approve + ``` + +The Terraform stack enables the required APIs, provisions a private GKE cluster, creates service accounts + Workload Identity bindings, Artifact Registry, Secret Manager entries, Cloud NAT, and model storage buckets. + +## Kubernetes (`k8s/`) + +`k8s/base` defines the shared manifests for the API, inference engine, HMI, MongoDB, and Redis components. The `k8s/overlays/dev` and `k8s/overlays/prod` folders provide environment-specific adjustments. Update the image registries, service-account annotations, ingress hosts, and secret literals before deploying. + +Render and apply an overlay: +```bash +kubectl apply -k k8s/overlays/dev +``` + +## Cloud Build (`cloudbuild.yaml`) + +`cloudbuild.yaml` builds the API, engine, and HMI containers, pushes them to Artifact Registry, and deploys the selected overlay. Adjust `_ENV`, `_REGION`, `_REPOSITORY`, and `_CLUSTER` substitutions to match your environment or override them when triggering the build. diff --git a/infra/terraform/.terraform.lock.hcl b/infra/terraform/.terraform.lock.hcl new file mode 100644 index 000000000..8a3040794 --- /dev/null +++ b/infra/terraform/.terraform.lock.hcl @@ -0,0 +1,42 @@ +# This file is maintained automatically by "terraform init". +# Manual edits may be lost in future updates. + +provider "registry.terraform.io/hashicorp/google" { + version = "5.45.2" + constraints = "~> 5.30" + hashes = [ + "h1:iy2Q9VcnMu4z/bH3v/NmI/nEpgYY7bXgJmT/hVTAUS4=", + "zh:0d09c8f20b556305192cdbe0efa6d333ceebba963a8ba91f9f1714b5a20c4b7a", + "zh:117143fc91be407874568df416b938a6896f94cb873f26bba279cedab646a804", + "zh:16ccf77d18dd2c5ef9c0625f9cf546ebdf3213c0a452f432204c69feed55081e", + "zh:3e555cf22a570a4bd247964671f421ed7517970cd9765ceb46f335edc2c6f392", + "zh:688bd5b05a75124da7ae6e885b2b92bd29f4261808b2b78bd5f51f525c1052ca", + "zh:6db3ef37a05010d82900bfffb3261c59a0c247e0692049cb3eb8c2ef16c9d7bf", + "zh:70316fde75f6a15d72749f66d994ccbdde5f5ed4311b6d06b99850f698c9bbf9", + "zh:84b8e583771a4f2bd514e519d98ed7fd28dce5efe0634e973170e1cfb5556fb4", + "zh:9d4b8ef0a9b6677935c604d94495042e68ff5489932cfd1ec41052e094a279d3", + "zh:a2089dd9bd825c107b148dd12d6b286f71aa37dfd4ca9c35157f2dcba7bc19d8", + "zh:f03d795c0fd9721e59839255ee7ba7414173017dc530b4ce566daf3802a0d6dd", + "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", + ] +} + +provider "registry.terraform.io/hashicorp/google-beta" { + version = "5.45.2" + constraints = "~> 5.30" + hashes = [ + "h1:ME/cVZGNln4h166gyo9r7CuunzZ3FEqlIaNyQ0e9yjE=", + "zh:16b77bac5d1555b7f066ba8014f4fc8a6d0de64e252a1988d3fbb400984a4b19", + "zh:1b13f515c4809343840aed8265915cc4191f138bdab5a8c5e1f542fdfc69989f", + "zh:1dcce4309aeab7c88fd36aea664d57e620d8a413b967ce513a5a866e8de901f2", + "zh:24db65d7929f2a731e9cac1750c569cb4528b312ef182a5e2e8c0cf008d8a71b", + "zh:28c0b9e68d97570f03b2c4770607701580055bcba50069efd145954aa13b23e4", + "zh:3a898a1ad1569f6486a2bc20014087284c8cab919bc8f155833de5128ccd12eb", + "zh:4eed99cfb9daada70f813f2cedcf490d3097de1ccb9b391fc451ecc46509c067", + "zh:888c4cb1f13b23674ba1091835dd3f1bff5d8e7729ef302183d8d01233819e54", + "zh:8baae3b949f6e9505425f5fa4785de786e9cedc4c3f3ad906d8ed560bd2e39c6", + "zh:cf2c8928b764592fa2cd14a9f109d01cd0a92049a4fca9d0a74cf2fe588364e2", + "zh:edff09394f5bd0b278a4adc800a31b7f150249a1ea92ca273ccf4acd25be3f63", + "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", + ] +} diff --git a/infra/terraform/main.tf b/infra/terraform/main.tf new file mode 100644 index 000000000..92c2e7ab3 --- /dev/null +++ b/infra/terraform/main.tf @@ -0,0 +1,120 @@ +locals { + artifact_repo_location = coalesce(var.artifact_repo_location, var.region) + + service_account_role_pairs = flatten([ + for sa_key, sa_def in var.workload_service_accounts : [ + for role in sa_def.roles : { + key = "${sa_key}-${replace(role, "/", "-")}" + sa_key = sa_key + role = role + } + ] + ]) + + bucket_config = { + for env, bucket in var.model_buckets : env => { + name = replace(bucket.name, "PROJECT_ID", var.project_id) + location = bucket.location + retention_days = bucket.retention_days + storage_class = try(bucket.storage_class, "STANDARD") + labels = merge({ env = env, app = "echonet" }, var.default_labels, try(bucket.labels, {})) + force_destroy = try(bucket.force_destroy, false) + } + } +} + +module "network" { + # Required GCP services (compute, container, servicenetworking, etc.) must be pre-enabled by an owner. + source = "./modules/network" + project_id = var.project_id + network_name = var.network_name + subnet_name = var.subnet_name + region = var.region + primary_cidr = var.subnet_ip_cidr + pod_cidr = var.pods_secondary_cidr + service_cidr = var.services_secondary_cidr +} + +module "gke" { + source = "./modules/gke" + providers = { google-beta = google-beta } + project_id = var.project_id + cluster_name = var.cluster_name + region = var.region + zones = var.zones + network = module.network.network_self_link + subnetwork = module.network.subnet_self_link + subnet_name = module.network.subnet_name + pod_secondary_range = module.network.pod_secondary_range + service_secondary_range = module.network.service_secondary_range + workload_identity_pool = "${var.project_id}.svc.id.goog" + master_ipv4_cidr = var.master_ipv4_cidr_block + master_authorized_cidrs = var.master_authorized_cidrs + release_channel = var.cluster_release_channel + logging_components = var.cluster_logging_components + monitoring_components = var.cluster_monitoring_components + node_pools = var.node_pools + depends_on = [module.network] +} + +resource "google_artifact_registry_repository" "primary" { + project = var.project_id + location = local.artifact_repo_location + repository_id = var.artifact_repo_name + format = "DOCKER" + description = "Container images for Project Echo workloads" + labels = merge({ app = "echonet" }, var.default_labels) +} + +resource "google_storage_bucket" "models" { + for_each = local.bucket_config + name = each.value.name + location = each.value.location + storage_class = each.value.storage_class + force_destroy = each.value.force_destroy + uniform_bucket_level_access = true + versioning { enabled = true } + lifecycle_rule { + action { type = "Delete" } + condition { age = each.value.retention_days } + } + labels = each.value.labels +} + +data "google_service_account" "workloads" { + for_each = var.workload_service_accounts + project = var.project_id + # Service accounts must be pre-created by a project owner using this naming pattern. + account_id = substr( + lower(replace("${var.environment}-${each.key}", "_", "-")), + 0, + 30 + ) +} + +resource "google_project_iam_member" "workload_roles" { + for_each = { for item in local.service_account_role_pairs : item.key => item } + project = var.project_id + role = each.value.role + member = "serviceAccount:${data.google_service_account.workloads[each.value.sa_key].email}" +} + +resource "google_service_account_iam_member" "workload_identity" { + for_each = { for binding in var.workload_identity_bindings : "${binding.service_account}-${binding.namespace}-${binding.ksa}" => binding } + service_account_id = data.google_service_account.workloads[each.value.service_account].name + role = "roles/iam.workloadIdentityUser" + member = "serviceAccount:${var.project_id}.svc.id.goog[${each.value.namespace}/${each.value.ksa}]" +} + +resource "google_secret_manager_secret" "managed" { + for_each = toset(var.secret_names) + secret_id = each.value + replication { + user_managed { + replicas { + location = var.region + } + } + } + labels = merge({ app = "echonet" }, var.default_labels) +} diff --git a/infra/terraform/modules/gke/main.tf b/infra/terraform/modules/gke/main.tf new file mode 100644 index 000000000..930cdfbeb --- /dev/null +++ b/infra/terraform/modules/gke/main.tf @@ -0,0 +1,132 @@ +locals { + node_pool_map = { for pool in var.node_pools : pool.name => pool } +} + +resource "google_container_cluster" "this" { + provider = google-beta + name = var.cluster_name + project = var.project_id + location = var.region + + remove_default_node_pool = true + initial_node_count = 1 + networking_mode = "VPC_NATIVE" + network = var.network + subnetwork = var.subnetwork + node_locations = length(var.zones) > 0 ? var.zones : null + + workload_identity_config { + workload_pool = var.workload_identity_pool + } + + ip_allocation_policy { + cluster_secondary_range_name = var.pod_secondary_range + services_secondary_range_name = var.service_secondary_range + } + + private_cluster_config { + enable_private_nodes = true + enable_private_endpoint = false + master_ipv4_cidr_block = var.master_ipv4_cidr + } + + dynamic "master_authorized_networks_config" { + for_each = length(var.master_authorized_cidrs) > 0 ? [true] : [] + content { + dynamic "cidr_blocks" { + for_each = var.master_authorized_cidrs + content { + cidr_block = cidr_blocks.value.cidr_block + display_name = try(cidr_blocks.value.description, "") + } + } + } + } + + release_channel { + channel = var.release_channel + } + + vertical_pod_autoscaling { + enabled = true + } + + logging_config { + enable_components = var.logging_components + } + + monitoring_config { + enable_components = var.monitoring_components + } + + addons_config { + http_load_balancing { + disabled = false + } + horizontal_pod_autoscaling { + disabled = false + } + } + + cluster_autoscaling { + autoscaling_profile = "OPTIMIZE_UTILIZATION" + } +} + +resource "google_container_node_pool" "this" { + provider = google-beta + for_each = local.node_pool_map + + name = each.value.name + project = var.project_id + cluster = google_container_cluster.this.name + location = var.region + + node_config { + machine_type = each.value.machine_type + disk_size_gb = try(each.value.disk_size_gb, 100) + disk_type = try(each.value.disk_type, "pd-balanced") + service_account = try(each.value.service_account, null) + spot = try(each.value.spot, false) + labels = merge({ pool = each.value.name }, try(each.value.labels, {})) + oauth_scopes = ["https://www.googleapis.com/auth/cloud-platform"] + + workload_metadata_config { + mode = "GKE_METADATA" + } + + shielded_instance_config { + enable_secure_boot = true + enable_integrity_monitoring = true + } + + dynamic "taint" { + for_each = try(each.value.taints, []) + content { + key = taint.value.key + value = taint.value.value + effect = taint.value.effect + } + } + + dynamic "guest_accelerator" { + for_each = try(each.value.gpu != null ? [each.value.gpu] : [], []) + content { + type = guest_accelerator.value.type + count = guest_accelerator.value.count + } + } + } + + autoscaling { + min_node_count = each.value.min_count + max_node_count = each.value.max_count + } + + management { + auto_repair = true + auto_upgrade = true + } + + initial_node_count = max(each.value.min_count, 1) +} diff --git a/infra/terraform/modules/gke/outputs.tf b/infra/terraform/modules/gke/outputs.tf new file mode 100644 index 000000000..f898dcc1d --- /dev/null +++ b/infra/terraform/modules/gke/outputs.tf @@ -0,0 +1,14 @@ +output "cluster_name" { + value = google_container_cluster.this.name + description = "Name of the created cluster" +} + +output "endpoint" { + value = google_container_cluster.this.endpoint + description = "Endpoint of the GKE API server" +} + +output "node_pool_ids" { + value = [for pool in google_container_node_pool.this : pool.id] + description = "Identifiers of the managed node pools" +} diff --git a/infra/terraform/modules/gke/variables.tf b/infra/terraform/modules/gke/variables.tf new file mode 100644 index 000000000..1fe42daf2 --- /dev/null +++ b/infra/terraform/modules/gke/variables.tf @@ -0,0 +1,97 @@ +variable "project_id" { + type = string + description = "Google Cloud project identifier" +} + +variable "cluster_name" { + type = string + description = "Name of the GKE cluster" +} + +variable "region" { + type = string + description = "Region for the regional GKE cluster" +} + +variable "zones" { + type = list(string) + default = [] + description = "Optional list of zones for node pool placement" +} + +variable "network" { + type = string + description = "Self link of the VPC network" +} + +variable "subnetwork" { + type = string + description = "Self link of the subnetwork" +} + +variable "subnet_name" { + type = string + description = "Name of the subnetwork" +} + +variable "pod_secondary_range" { + type = string + description = "Secondary IP range name for pods" +} + +variable "service_secondary_range" { + type = string + description = "Secondary IP range name for services" +} + +variable "workload_identity_pool" { + type = string + description = "Workload Identity pool name" +} + +variable "master_ipv4_cidr" { + type = string + description = "Control plane CIDR block" +} + +variable "master_authorized_cidrs" { + type = list(object({ + cidr_block = string + description = optional(string, "") + })) + default = [] + description = "CIDR ranges allowed to reach the control plane" +} + +variable "release_channel" { + type = string + description = "Desired GKE release channel" +} + +variable "logging_components" { + type = list(string) + description = "Logging components to enable" +} + +variable "monitoring_components" { + type = list(string) + description = "Monitoring components to enable" +} + +variable "node_pools" { + type = list(object({ + name = string + machine_type = string + min_count = number + max_count = number + disk_size_gb = optional(number, 100) + disk_type = optional(string, "pd-balanced") + service_account = optional(string) + spot = optional(bool, false) + labels = optional(map(string), {}) + taints = optional(list(object({ key = string, value = string, effect = string })), []) + gpu = optional(object({ type = string, count = number })) + })) + description = "Node pool definitions" +} + diff --git a/infra/terraform/modules/gke/versions.tf b/infra/terraform/modules/gke/versions.tf new file mode 100644 index 000000000..4deafbc31 --- /dev/null +++ b/infra/terraform/modules/gke/versions.tf @@ -0,0 +1,10 @@ +terraform { + required_providers { + google = { + source = "hashicorp/google" + } + google-beta = { + source = "hashicorp/google-beta" + } + } +} diff --git a/infra/terraform/modules/network/main.tf b/infra/terraform/modules/network/main.tf new file mode 100644 index 000000000..f3448c833 --- /dev/null +++ b/infra/terraform/modules/network/main.tf @@ -0,0 +1,53 @@ +locals { + pod_range_name = "${var.subnet_name}-pods" + service_range_name = "${var.subnet_name}-services" + router_name = "${var.network_name}-router" + nat_name = "${var.network_name}-nat" +} + +resource "google_compute_network" "this" { + name = var.network_name + project = var.project_id + auto_create_subnetworks = false + routing_mode = "REGIONAL" +} + +resource "google_compute_subnetwork" "primary" { + name = var.subnet_name + project = var.project_id + region = var.region + network = google_compute_network.this.id + ip_cidr_range = var.primary_cidr + stack_type = "IPV4_ONLY" + + secondary_ip_range { + range_name = local.pod_range_name + ip_cidr_range = var.pod_cidr + } + + secondary_ip_range { + range_name = local.service_range_name + ip_cidr_range = var.service_cidr + } +} + +resource "google_compute_router" "this" { + name = local.router_name + project = var.project_id + region = var.region + network = google_compute_network.this.id +} + +resource "google_compute_router_nat" "this" { + name = local.nat_name + project = var.project_id + region = var.region + router = google_compute_router.this.name + nat_ip_allocate_option = "AUTO_ONLY" + source_subnetwork_ip_ranges_to_nat = "ALL_SUBNETWORKS_ALL_IP_RANGES" + + log_config { + filter = "ERRORS_ONLY" + enable = true + } +} diff --git a/infra/terraform/modules/network/outputs.tf b/infra/terraform/modules/network/outputs.tf new file mode 100644 index 000000000..05a3f4891 --- /dev/null +++ b/infra/terraform/modules/network/outputs.tf @@ -0,0 +1,29 @@ +output "network_name" { + value = google_compute_network.this.name + description = "Name of the created VPC network" +} + +output "network_self_link" { + value = google_compute_network.this.self_link + description = "Self link of the VPC network" +} + +output "subnet_name" { + value = google_compute_subnetwork.primary.name + description = "Name of the primary subnetwork" +} + +output "subnet_self_link" { + value = google_compute_subnetwork.primary.self_link + description = "Self link of the primary subnetwork" +} + +output "pod_secondary_range" { + value = local.pod_range_name + description = "Name of the secondary IP range allocated for pods" +} + +output "service_secondary_range" { + value = local.service_range_name + description = "Name of the secondary IP range allocated for services" +} diff --git a/infra/terraform/modules/network/variables.tf b/infra/terraform/modules/network/variables.tf new file mode 100644 index 000000000..c424f4fb0 --- /dev/null +++ b/infra/terraform/modules/network/variables.tf @@ -0,0 +1,34 @@ +variable "project_id" { + type = string + description = "Google Cloud project identifier" +} + +variable "network_name" { + type = string + description = "VPC network name" +} + +variable "subnet_name" { + type = string + description = "Primary subnetwork name" +} + +variable "region" { + type = string + description = "Region for subnetwork and router" +} + +variable "primary_cidr" { + type = string + description = "CIDR block for the primary subnetwork" +} + +variable "pod_cidr" { + type = string + description = "Secondary CIDR block for pod IPs" +} + +variable "service_cidr" { + type = string + description = "Secondary CIDR block for service IPs" +} diff --git a/infra/terraform/modules/network/versions.tf b/infra/terraform/modules/network/versions.tf new file mode 100644 index 000000000..b3340e113 --- /dev/null +++ b/infra/terraform/modules/network/versions.tf @@ -0,0 +1,7 @@ +terraform { + required_providers { + google = { + source = "hashicorp/google" + } + } +} diff --git a/infra/terraform/outputs.tf b/infra/terraform/outputs.tf new file mode 100644 index 000000000..fa0f8f85e --- /dev/null +++ b/infra/terraform/outputs.tf @@ -0,0 +1,24 @@ +output "cluster_name" { + value = module.gke.cluster_name + description = "Name of the managed GKE cluster" +} + +output "cluster_endpoint" { + value = module.gke.endpoint + description = "Public endpoint for the GKE control plane" +} + +output "network_name" { + value = module.network.network_name + description = "Name of the provisioned VPC network" +} + +output "artifact_registry_repository" { + value = google_artifact_registry_repository.primary.repository_id + description = "Artifact Registry repository identifier" +} + +output "workload_service_accounts" { + value = { for key, sa in data.google_service_account.workloads : key => sa.email } + description = "Map of workload service account keys to emails" +} diff --git a/infra/terraform/provider.tf b/infra/terraform/provider.tf new file mode 100644 index 000000000..3377ee9ec --- /dev/null +++ b/infra/terraform/provider.tf @@ -0,0 +1,27 @@ +terraform { + required_version = ">= 1.6.0" + required_providers { + google = { + source = "hashicorp/google" + version = "~> 5.30" + } + google-beta = { + source = "hashicorp/google-beta" + version = "~> 5.30" + } + } +} + +provider "google" { + project = var.project_id + region = var.region + user_project_override = true + # Provide credentials via GOOGLE_APPLICATION_CREDENTIALS or GOOGLE_CLOUD_KEYFILE_JSON env vars. +} + +provider "google-beta" { + project = var.project_id + region = var.region + user_project_override = true + # Provide credentials via GOOGLE_APPLICATION_CREDENTIALS or GOOGLE_CLOUD_KEYFILE_JSON env vars. +} diff --git a/infra/terraform/terraform.tfvars.example b/infra/terraform/terraform.tfvars.example new file mode 100644 index 000000000..31eb077de --- /dev/null +++ b/infra/terraform/terraform.tfvars.example @@ -0,0 +1,50 @@ +project_id = "sit-23t1-project-echo-25288b9" +region = "australia-southeast2" +zones = ["australia-southeast2-a", "australia-southeast2-b"] +environment = "dev" +cluster_name = "echonet-gke" +network_name = "echonet-network" +subnet_name = "echonet-subnet" +subnet_ip_cidr = "10.30.0.0/20" +pods_secondary_cidr = "10.40.0.0/16" +services_secondary_cidr = "10.50.0.0/20" +master_ipv4_cidr_block = "172.16.0.0/28" +master_authorized_cidrs = [{ cidr_block = "0.0.0.0/0", description = "temporary-open" }] +artifact_repo_name = "echonet" +artifact_repo_location = "australia-southeast2" + +model_buckets = { + stg = { + name = "PROJECT_ID-echonet-models-stg" + location = "australia-southeast2" + retention_days = 120 + } + prod = { + name = "PROJECT_ID-echonet-models-prod" + location = "australia-southeast2" + retention_days = 365 + } +} + +workload_service_accounts = { + api = { + display_name = "API Workload" + roles = ["roles/artifactregistry.reader", "roles/secretmanager.secretAccessor", "roles/storage.objectViewer"] + } + engine = { + display_name = "Engine Workload" + roles = ["roles/secretmanager.secretAccessor", "roles/storage.objectViewer"] + } + hmi = { + display_name = "HMI Workload" + roles = ["roles/secretmanager.secretAccessor"] + } +} + +workload_identity_bindings = [ + { service_account = "api", namespace = "project-echo", ksa = "api" }, + { service_account = "engine", namespace = "project-echo", ksa = "engine" }, + { service_account = "hmi", namespace = "project-echo", ksa = "hmi" } +] + +secret_names = ["mongo-uri", "redis-password", "twilio-auth-token"] diff --git a/infra/terraform/tfplan b/infra/terraform/tfplan new file mode 100644 index 000000000..964c7eab9 Binary files /dev/null and b/infra/terraform/tfplan differ diff --git a/infra/terraform/variables.tf b/infra/terraform/variables.tf new file mode 100644 index 000000000..9e402ece0 --- /dev/null +++ b/infra/terraform/variables.tf @@ -0,0 +1,226 @@ +variable "project_id" { + type = string + description = "Google Cloud project identifier" +} + +variable "region" { + type = string + description = "Primary deployment region" +} + +variable "zones" { + type = list(string) + default = [] + description = "Optional list of GCP zones for node placement" +} + +variable "environment" { + type = string + default = "dev" + description = "Environment name used for resource naming" +} + +variable "network_name" { + type = string + default = "echonet-network" + description = "VPC network name" +} + +variable "subnet_name" { + type = string + default = "echonet-subnet" + description = "Primary subnetwork name" +} + +variable "subnet_ip_cidr" { + type = string + default = "10.30.0.0/20" + description = "CIDR for the primary subnetwork" +} + +variable "pods_secondary_cidr" { + type = string + default = "10.40.0.0/16" + description = "CIDR range allocated for GKE pods" +} + +variable "services_secondary_cidr" { + type = string + default = "10.50.0.0/20" + description = "CIDR range allocated for GKE services" +} + +variable "cluster_name" { + type = string + default = "echonet-gke" + description = "Name of the GKE cluster" +} + +variable "master_ipv4_cidr_block" { + type = string + default = "172.16.0.0/28" + description = "Control plane CIDR block for private GKE clusters" +} + +variable "master_authorized_cidrs" { + type = list(object({ + cidr_block = string + description = optional(string, "") + })) + default = [] + description = "CIDR blocks allowed to access the GKE control plane" +} + +variable "cluster_release_channel" { + type = string + default = "REGULAR" + description = "GKE release channel" +} + +variable "cluster_logging_components" { + type = list(string) + default = ["SYSTEM_COMPONENTS", "WORKLOADS"] + description = "GKE logging components to enable" +} + +variable "cluster_monitoring_components" { + type = list(string) + default = ["SYSTEM_COMPONENTS", "POD"] + description = "GKE monitoring components to enable" +} + +variable "node_pools" { + description = "Node pool definitions for the GKE cluster" + type = list(object({ + name = string + machine_type = string + min_count = number + max_count = number + disk_size_gb = optional(number, 100) + disk_type = optional(string, "pd-balanced") + service_account = optional(string) + spot = optional(bool, false) + labels = optional(map(string), {}) + taints = optional(list(object({ key = string, value = string, effect = string })), []) + gpu = optional(object({ type = string, count = number })) + })) + default = [ + { + name = "general" + machine_type = "e2-standard-4" + min_count = 1 + max_count = 4 + labels = { role = "general" } + }, + { + name = "gpu" + machine_type = "n1-standard-4" + min_count = 0 + max_count = 2 + labels = { role = "gpu" } + taints = [{ key = "gpu", value = "true", effect = "NO_SCHEDULE" }] + gpu = { type = "nvidia-tesla-t4", count = 1 } # Guardrails restrict to N1 + Tesla T4, max 2 GPUs/project + } + ] +} + +variable "artifact_repo_name" { + type = string + default = "echonet" + description = "Artifact Registry repository name" +} + +variable "artifact_repo_location" { + type = string + default = null + description = "Location for Artifact Registry (defaults to region)" +} + +variable "model_buckets" { + description = "Model storage buckets keyed by environment" + type = map(object({ + name = string + location = string + retention_days = number + storage_class = optional(string, "STANDARD") + force_destroy = optional(bool, false) + labels = optional(map(string), {}) + })) + default = {} +} + +variable "workload_service_accounts" { + description = "Workload service account definitions" + type = map(object({ + display_name = string + description = optional(string) + roles = list(string) + })) + default = { + api = { + display_name = "API Workload" + roles = [ + "roles/artifactregistry.reader", + "roles/secretmanager.secretAccessor", + "roles/storage.objectViewer" + ] + } + engine = { + display_name = "Engine Workload" + roles = [ + "roles/secretmanager.secretAccessor", + "roles/storage.objectViewer" + ] + } + hmi = { + display_name = "HMI Workload" + roles = [ + "roles/secretmanager.secretAccessor" + ] + } + } +} + +variable "workload_identity_bindings" { + description = "Mappings between Google and Kubernetes service accounts" + type = list(object({ + service_account = string + namespace = string + ksa = string + })) + default = [ + { + service_account = "api" + namespace = "project-echo" + ksa = "api" + }, + { + service_account = "engine" + namespace = "project-echo" + ksa = "engine" + }, + { + service_account = "hmi" + namespace = "project-echo" + ksa = "hmi" + } + ] +} + +variable "secret_names" { + type = list(string) + default = ["mongo-uri", "redis-password", "twilio-auth-token"] + description = "Secret Manager secret identifiers to provision" +} + +variable "default_labels" { + type = map(string) + default = {} + description = "Additional labels applied to managed resources" +} + +variable "project_services_additional" { + type = list(string) + default = [] + description = "Additional APIs to enable on the project" +} diff --git a/k8s/base/api-deployment.yaml b/k8s/base/api-deployment.yaml new file mode 100644 index 000000000..c12c9f7e2 --- /dev/null +++ b/k8s/base/api-deployment.yaml @@ -0,0 +1,68 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: api + namespace: project-echo + labels: + app.kubernetes.io/name: api + app.kubernetes.io/component: backend +spec: + replicas: 2 + revisionHistoryLimit: 3 + selector: + matchLabels: + app.kubernetes.io/name: api + template: + metadata: + labels: + app.kubernetes.io/name: api + app.kubernetes.io/component: backend + spec: + serviceAccountName: api + containers: + - name: api + image: REGION-docker.pkg.dev/PROJECT_ID/echonet/api:latest + imagePullPolicy: IfNotPresent + ports: + - name: http + containerPort: 8080 + env: + - name: APP_ENV + valueFrom: + configMapKeyRef: + name: project-echo-config + key: APP_ENV + - name: LOG_LEVEL + valueFrom: + configMapKeyRef: + name: project-echo-config + key: LOG_LEVEL + - name: MONGO_URI + valueFrom: + secretKeyRef: + name: api-secrets + key: MONGO_URI + - name: REDIS_URL + valueFrom: + secretKeyRef: + name: api-secrets + key: REDIS_URL + readinessProbe: + httpGet: + path: /healthz + port: http + initialDelaySeconds: 10 + periodSeconds: 10 + livenessProbe: + httpGet: + path: /healthz + port: http + initialDelaySeconds: 30 + periodSeconds: 30 + resources: + requests: + cpu: 250m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi diff --git a/k8s/base/api-service.yaml b/k8s/base/api-service.yaml new file mode 100644 index 000000000..5f2bb9225 --- /dev/null +++ b/k8s/base/api-service.yaml @@ -0,0 +1,17 @@ +apiVersion: v1 +kind: Service +metadata: + name: api + namespace: project-echo + labels: + app.kubernetes.io/name: api + app.kubernetes.io/component: backend +spec: + type: ClusterIP + selector: + app.kubernetes.io/name: api + ports: + - name: http + protocol: TCP + port: 80 + targetPort: http diff --git a/k8s/base/configmap-app.yaml b/k8s/base/configmap-app.yaml new file mode 100644 index 000000000..886361515 --- /dev/null +++ b/k8s/base/configmap-app.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: project-echo-config + namespace: project-echo + labels: + app.kubernetes.io/component: shared-config + app.kubernetes.io/part-of: project-echo + annotations: + config.kubernetes.io/channel: default +data: + APP_ENV: "dev" + API_BASE_PATH: "/api" + LOG_LEVEL: "INFO" diff --git a/k8s/base/engine-deployment.yaml b/k8s/base/engine-deployment.yaml new file mode 100644 index 000000000..d6250ab02 --- /dev/null +++ b/k8s/base/engine-deployment.yaml @@ -0,0 +1,72 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: engine + namespace: project-echo + labels: + app.kubernetes.io/name: engine + app.kubernetes.io/component: inference +spec: + replicas: 2 + revisionHistoryLimit: 3 + selector: + matchLabels: + app.kubernetes.io/name: engine + template: + metadata: + labels: + app.kubernetes.io/name: engine + app.kubernetes.io/component: inference + spec: + serviceAccountName: engine + containers: + - name: engine + image: REGION-docker.pkg.dev/PROJECT_ID/echonet/engine:latest + imagePullPolicy: IfNotPresent + ports: + - name: http + containerPort: 8000 + env: + - name: APP_ENV + valueFrom: + configMapKeyRef: + name: project-echo-config + key: APP_ENV + - name: MODEL_BUCKET + valueFrom: + secretKeyRef: + name: engine-secrets + key: MODEL_BUCKET + - name: GOOGLE_APPLICATION_CREDENTIALS + value: "/var/run/secrets/workload-identity/credentials.json" + volumeMounts: + - name: wi-credentials + mountPath: /var/run/secrets/workload-identity + readOnly: true + readinessProbe: + httpGet: + path: /healthz + port: http + initialDelaySeconds: 20 + periodSeconds: 15 + livenessProbe: + httpGet: + path: /healthz + port: http + initialDelaySeconds: 45 + periodSeconds: 30 + resources: + requests: + cpu: 500m + memory: 1Gi + limits: + cpu: "1" + memory: 2Gi + volumes: + - name: wi-credentials + projected: + sources: + - serviceAccountToken: + path: credentials.json + expirationSeconds: 3600 + audience: https://sts.googleapis.com diff --git a/k8s/base/engine-service.yaml b/k8s/base/engine-service.yaml new file mode 100644 index 000000000..a19794d04 --- /dev/null +++ b/k8s/base/engine-service.yaml @@ -0,0 +1,17 @@ +apiVersion: v1 +kind: Service +metadata: + name: engine + namespace: project-echo + labels: + app.kubernetes.io/name: engine + app.kubernetes.io/component: inference +spec: + type: ClusterIP + selector: + app.kubernetes.io/name: engine + ports: + - name: http + protocol: TCP + port: 80 + targetPort: http diff --git a/k8s/base/hmi-deployment.yaml b/k8s/base/hmi-deployment.yaml new file mode 100644 index 000000000..3e6984105 --- /dev/null +++ b/k8s/base/hmi-deployment.yaml @@ -0,0 +1,53 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: hmi + namespace: project-echo + labels: + app.kubernetes.io/name: hmi + app.kubernetes.io/component: frontend +spec: + replicas: 2 + revisionHistoryLimit: 3 + selector: + matchLabels: + app.kubernetes.io/name: hmi + template: + metadata: + labels: + app.kubernetes.io/name: hmi + app.kubernetes.io/component: frontend + spec: + serviceAccountName: hmi + containers: + - name: hmi + image: REGION-docker.pkg.dev/PROJECT_ID/echonet/hmi:latest + imagePullPolicy: IfNotPresent + ports: + - name: http + containerPort: 3000 + env: + - name: API_BASE_URL + valueFrom: + configMapKeyRef: + name: project-echo-config + key: API_BASE_PATH + readinessProbe: + httpGet: + path: / + port: http + initialDelaySeconds: 10 + periodSeconds: 10 + livenessProbe: + httpGet: + path: / + port: http + initialDelaySeconds: 30 + periodSeconds: 30 + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 250m + memory: 256Mi diff --git a/k8s/base/hmi-service.yaml b/k8s/base/hmi-service.yaml new file mode 100644 index 000000000..87fc3c4b3 --- /dev/null +++ b/k8s/base/hmi-service.yaml @@ -0,0 +1,17 @@ +apiVersion: v1 +kind: Service +metadata: + name: hmi + namespace: project-echo + labels: + app.kubernetes.io/name: hmi + app.kubernetes.io/component: frontend +spec: + type: ClusterIP + selector: + app.kubernetes.io/name: hmi + ports: + - name: http + protocol: TCP + port: 80 + targetPort: http diff --git a/k8s/base/ingress.yaml b/k8s/base/ingress.yaml new file mode 100644 index 000000000..4956d915f --- /dev/null +++ b/k8s/base/ingress.yaml @@ -0,0 +1,27 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: project-echo + namespace: project-echo + annotations: + kubernetes.io/ingress.class: "gce" + networking.gke.io/managed-certificates: project-echo-cert +spec: + rules: + - host: echo.example.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: hmi + port: + name: http + - path: /api + pathType: Prefix + backend: + service: + name: api + port: + name: http diff --git a/k8s/base/kustomization.yaml b/k8s/base/kustomization.yaml new file mode 100644 index 000000000..1c9adc395 --- /dev/null +++ b/k8s/base/kustomization.yaml @@ -0,0 +1,26 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: project-echo + +commonLabels: + app.kubernetes.io/managed-by: kustomize + app.kubernetes.io/part-of: project-echo + +resources: + - namespace.yaml + - serviceaccounts.yaml + - configmap-app.yaml + - secrets.yaml + - api-deployment.yaml + - api-service.yaml + - engine-deployment.yaml + - engine-service.yaml + - hmi-deployment.yaml + - hmi-service.yaml + - mongo-statefulset.yaml + - mongo-service.yaml + - redis-statefulset.yaml + - redis-service.yaml + - ingress.yaml + - networkpolicy.yaml diff --git a/k8s/base/mongo-service.yaml b/k8s/base/mongo-service.yaml new file mode 100644 index 000000000..e7ce0a471 --- /dev/null +++ b/k8s/base/mongo-service.yaml @@ -0,0 +1,17 @@ +apiVersion: v1 +kind: Service +metadata: + name: mongo + namespace: project-echo + labels: + app.kubernetes.io/name: mongo + app.kubernetes.io/component: datastore +spec: + clusterIP: None + selector: + app.kubernetes.io/name: mongo + ports: + - name: mongo + port: 27017 + targetPort: mongo + protocol: TCP diff --git a/k8s/base/mongo-statefulset.yaml b/k8s/base/mongo-statefulset.yaml new file mode 100644 index 000000000..bad9dd71a --- /dev/null +++ b/k8s/base/mongo-statefulset.yaml @@ -0,0 +1,58 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: mongo + namespace: project-echo + labels: + app.kubernetes.io/name: mongo + app.kubernetes.io/component: datastore +spec: + serviceName: mongo + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: mongo + template: + metadata: + labels: + app.kubernetes.io/name: mongo + app.kubernetes.io/component: datastore + spec: + containers: + - name: mongo + image: mongo:6.0 + imagePullPolicy: IfNotPresent + ports: + - name: mongo + containerPort: 27017 + env: + - name: MONGO_INITDB_ROOT_USERNAME + valueFrom: + secretKeyRef: + name: mongo-secrets + key: ROOT_USERNAME + - name: MONGO_INITDB_ROOT_PASSWORD + valueFrom: + secretKeyRef: + name: mongo-secrets + key: ROOT_PASSWORD + volumeMounts: + - name: mongo-data + mountPath: /data/db + resources: + requests: + cpu: 200m + memory: 512Mi + limits: + cpu: 500m + memory: 1Gi + volumeClaimTemplates: + - metadata: + name: mongo-data + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 20Gi + storageClassName: standard-rwo diff --git a/k8s/base/namespace.yaml b/k8s/base/namespace.yaml new file mode 100644 index 000000000..d4465e7d7 --- /dev/null +++ b/k8s/base/namespace.yaml @@ -0,0 +1,6 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: project-echo + labels: + app.kubernetes.io/name: project-echo diff --git a/k8s/base/networkpolicy.yaml b/k8s/base/networkpolicy.yaml new file mode 100644 index 000000000..5b0892faa --- /dev/null +++ b/k8s/base/networkpolicy.yaml @@ -0,0 +1,31 @@ +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: restrict-default + namespace: project-echo +spec: + podSelector: {} + policyTypes: + - Ingress + - Egress + ingress: + - from: + - namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: project-echo + egress: + - to: + - namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: project-echo + ports: + - port: 53 + protocol: UDP + - port: 53 + protocol: TCP + - to: + - ipBlock: + cidr: 0.0.0.0/0 + ports: + - port: 443 + protocol: TCP diff --git a/k8s/base/redis-service.yaml b/k8s/base/redis-service.yaml new file mode 100644 index 000000000..d236d8046 --- /dev/null +++ b/k8s/base/redis-service.yaml @@ -0,0 +1,17 @@ +apiVersion: v1 +kind: Service +metadata: + name: redis + namespace: project-echo + labels: + app.kubernetes.io/name: redis + app.kubernetes.io/component: cache +spec: + clusterIP: None + selector: + app.kubernetes.io/name: redis + ports: + - name: redis + port: 6379 + targetPort: redis + protocol: TCP diff --git a/k8s/base/redis-statefulset.yaml b/k8s/base/redis-statefulset.yaml new file mode 100644 index 000000000..f7cfb10da --- /dev/null +++ b/k8s/base/redis-statefulset.yaml @@ -0,0 +1,68 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: redis + namespace: project-echo + labels: + app.kubernetes.io/name: redis + app.kubernetes.io/component: cache +spec: + serviceName: redis + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: redis + template: + metadata: + labels: + app.kubernetes.io/name: redis + app.kubernetes.io/component: cache + spec: + containers: + - name: redis + image: redis:7.2 + imagePullPolicy: IfNotPresent + ports: + - name: redis + containerPort: 6379 + env: + - name: REDIS_PASSWORD + valueFrom: + secretKeyRef: + name: redis-secrets + key: REDIS_PASSWORD + command: + - sh + - -c + - | + exec redis-server --requirepass "$REDIS_PASSWORD" --save "" + readinessProbe: + tcpSocket: + port: redis + initialDelaySeconds: 10 + periodSeconds: 5 + livenessProbe: + tcpSocket: + port: redis + initialDelaySeconds: 30 + periodSeconds: 10 + volumeMounts: + - name: redis-data + mountPath: /data + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 250m + memory: 512Mi + volumeClaimTemplates: + - metadata: + name: redis-data + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 10Gi + storageClassName: standard-rwo diff --git a/k8s/base/secrets.yaml b/k8s/base/secrets.yaml new file mode 100644 index 000000000..57ef77c30 --- /dev/null +++ b/k8s/base/secrets.yaml @@ -0,0 +1,47 @@ +apiVersion: v1 +kind: List +metadata: + resourceVersion: "" +items: + - apiVersion: v1 + kind: Secret + metadata: + name: api-secrets + namespace: project-echo + annotations: + secret.kubernetes.io/managed-by: external + type: Opaque + stringData: + MONGO_URI: "" + REDIS_URL: "" + - apiVersion: v1 + kind: Secret + metadata: + name: engine-secrets + namespace: project-echo + annotations: + secret.kubernetes.io/managed-by: external + type: Opaque + stringData: + MODEL_BUCKET: "" + - apiVersion: v1 + kind: Secret + metadata: + name: mongo-secrets + namespace: project-echo + annotations: + secret.kubernetes.io/managed-by: external + type: Opaque + stringData: + ROOT_USERNAME: "" + ROOT_PASSWORD: "" + - apiVersion: v1 + kind: Secret + metadata: + name: redis-secrets + namespace: project-echo + annotations: + secret.kubernetes.io/managed-by: external + type: Opaque + stringData: + REDIS_PASSWORD: "" diff --git a/k8s/base/serviceaccounts.yaml b/k8s/base/serviceaccounts.yaml new file mode 100644 index 000000000..814b2aad8 --- /dev/null +++ b/k8s/base/serviceaccounts.yaml @@ -0,0 +1,27 @@ +apiVersion: v1 +kind: List +metadata: + resourceVersion: "" + continue: "" +items: + - apiVersion: v1 + kind: ServiceAccount + metadata: + name: api + namespace: project-echo + annotations: + iam.gke.io/gcp-service-account: placeholder-api@PROJECT_ID.iam.gserviceaccount.com + - apiVersion: v1 + kind: ServiceAccount + metadata: + name: engine + namespace: project-echo + annotations: + iam.gke.io/gcp-service-account: placeholder-engine@PROJECT_ID.iam.gserviceaccount.com + - apiVersion: v1 + kind: ServiceAccount + metadata: + name: hmi + namespace: project-echo + annotations: + iam.gke.io/gcp-service-account: placeholder-hmi@PROJECT_ID.iam.gserviceaccount.com diff --git a/k8s/overlays/dev/kustomization.yaml b/k8s/overlays/dev/kustomization.yaml new file mode 100644 index 000000000..ff39f95ff --- /dev/null +++ b/k8s/overlays/dev/kustomization.yaml @@ -0,0 +1,54 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: project-echo + +resources: + - ../../base + - managedcertificate.yaml + +images: + - name: REGION-docker.pkg.dev/PROJECT_ID/echonet/api + newName: australia-southeast2-docker.pkg.dev/sit-23t1-project-echo-25288b9/echonet/api + newTag: dev + - name: REGION-docker.pkg.dev/PROJECT_ID/echonet/engine + newName: australia-southeast2-docker.pkg.dev/sit-23t1-project-echo-25288b9/echonet/engine + newTag: dev + - name: REGION-docker.pkg.dev/PROJECT_ID/echonet/hmi + newName: australia-southeast2-docker.pkg.dev/sit-23t1-project-echo-25288b9/echonet/hmi + newTag: dev + +configMapGenerator: + - name: project-echo-config + behavior: merge + literals: + - APP_ENV=dev + - API_BASE_PATH=https://api.dev.echo.example.com + - LOG_LEVEL=DEBUG + +secretGenerator: + - name: api-secrets + behavior: merge + literals: + - MONGO_URI=mongodb://mongo.project-echo.svc.cluster.local:27017/echo + - REDIS_URL=redis://:changeme@redis.project-echo.svc.cluster.local:6379/0 + - name: engine-secrets + behavior: merge + literals: + - MODEL_BUCKET=gs://sit-23t1-project-echo-25288b9-echonet-models-stg + - name: mongo-secrets + behavior: merge + literals: + - ROOT_USERNAME=admin + - ROOT_PASSWORD=changeme + - name: redis-secrets + behavior: merge + literals: + - REDIS_PASSWORD=changeme + +patches: + - path: patches/serviceaccounts.yaml + - path: patches/ingress.yaml + +generatorOptions: + disableNameSuffixHash: true diff --git a/k8s/overlays/dev/managedcertificate.yaml b/k8s/overlays/dev/managedcertificate.yaml new file mode 100644 index 000000000..987c3cf14 --- /dev/null +++ b/k8s/overlays/dev/managedcertificate.yaml @@ -0,0 +1,8 @@ +apiVersion: networking.gke.io/v1 +kind: ManagedCertificate +metadata: + name: project-echo-dev-cert + namespace: project-echo +spec: + domains: + - dev.echo.example.com diff --git a/k8s/overlays/dev/patches/ingress.yaml b/k8s/overlays/dev/patches/ingress.yaml new file mode 100644 index 000000000..f7a94f545 --- /dev/null +++ b/k8s/overlays/dev/patches/ingress.yaml @@ -0,0 +1,26 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: project-echo + namespace: project-echo + annotations: + networking.gke.io/managed-certificates: project-echo-dev-cert +spec: + rules: + - host: dev.echo.example.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: hmi + port: + name: http + - path: /api + pathType: Prefix + backend: + service: + name: api + port: + name: http diff --git a/k8s/overlays/dev/patches/serviceaccounts.yaml b/k8s/overlays/dev/patches/serviceaccounts.yaml new file mode 100644 index 000000000..196a0d072 --- /dev/null +++ b/k8s/overlays/dev/patches/serviceaccounts.yaml @@ -0,0 +1,23 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: api + namespace: project-echo + annotations: + iam.gke.io/gcp-service-account: dev-api@sit-23t1-project-echo-25288b9.iam.gserviceaccount.com +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: engine + namespace: project-echo + annotations: + iam.gke.io/gcp-service-account: dev-engine@sit-23t1-project-echo-25288b9.iam.gserviceaccount.com +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: hmi + namespace: project-echo + annotations: + iam.gke.io/gcp-service-account: dev-hmi@sit-23t1-project-echo-25288b9.iam.gserviceaccount.com diff --git a/k8s/overlays/prod/kustomization.yaml b/k8s/overlays/prod/kustomization.yaml new file mode 100644 index 000000000..6a0c123a2 --- /dev/null +++ b/k8s/overlays/prod/kustomization.yaml @@ -0,0 +1,54 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: project-echo + +resources: + - ../../base + - managedcertificate.yaml + +images: + - name: REGION-docker.pkg.dev/PROJECT_ID/echonet/api + newName: australia-southeast2-docker.pkg.dev/sit-23t1-project-echo-25288b9/echonet/api + newTag: prod + - name: REGION-docker.pkg.dev/PROJECT_ID/echonet/engine + newName: australia-southeast2-docker.pkg.dev/sit-23t1-project-echo-25288b9/echonet/engine + newTag: prod + - name: REGION-docker.pkg.dev/PROJECT_ID/echonet/hmi + newName: australia-southeast2-docker.pkg.dev/sit-23t1-project-echo-25288b9/echonet/hmi + newTag: prod + +configMapGenerator: + - name: project-echo-config + behavior: merge + literals: + - APP_ENV=prod + - API_BASE_PATH=https://api.echo.example.com + - LOG_LEVEL=INFO + +secretGenerator: + - name: api-secrets + behavior: merge + literals: + - MONGO_URI=mongodb://mongo.project-echo.svc.cluster.local:27017/echo + - REDIS_URL=redis://:strongpassword@redis.project-echo.svc.cluster.local:6379/0 + - name: engine-secrets + behavior: merge + literals: + - MODEL_BUCKET=gs://sit-23t1-project-echo-25288b9-echonet-models-prod + - name: mongo-secrets + behavior: merge + literals: + - ROOT_USERNAME=admin + - ROOT_PASSWORD=changeme + - name: redis-secrets + behavior: merge + literals: + - REDIS_PASSWORD=strongpassword + +patches: + - path: patches/serviceaccounts.yaml + - path: patches/ingress.yaml + +generatorOptions: + disableNameSuffixHash: true diff --git a/k8s/overlays/prod/managedcertificate.yaml b/k8s/overlays/prod/managedcertificate.yaml new file mode 100644 index 000000000..46021b9d1 --- /dev/null +++ b/k8s/overlays/prod/managedcertificate.yaml @@ -0,0 +1,8 @@ +apiVersion: networking.gke.io/v1 +kind: ManagedCertificate +metadata: + name: project-echo-prod-cert + namespace: project-echo +spec: + domains: + - echo.example.com diff --git a/k8s/overlays/prod/patches/ingress.yaml b/k8s/overlays/prod/patches/ingress.yaml new file mode 100644 index 000000000..61fb65cf3 --- /dev/null +++ b/k8s/overlays/prod/patches/ingress.yaml @@ -0,0 +1,26 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: project-echo + namespace: project-echo + annotations: + networking.gke.io/managed-certificates: project-echo-prod-cert +spec: + rules: + - host: echo.example.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: hmi + port: + name: http + - path: /api + pathType: Prefix + backend: + service: + name: api + port: + name: http diff --git a/k8s/overlays/prod/patches/serviceaccounts.yaml b/k8s/overlays/prod/patches/serviceaccounts.yaml new file mode 100644 index 000000000..1b1b49c2b --- /dev/null +++ b/k8s/overlays/prod/patches/serviceaccounts.yaml @@ -0,0 +1,23 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: api + namespace: project-echo + annotations: + iam.gke.io/gcp-service-account: prod-api@sit-23t1-project-echo-25288b9.iam.gserviceaccount.com +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: engine + namespace: project-echo + annotations: + iam.gke.io/gcp-service-account: prod-engine@sit-23t1-project-echo-25288b9.iam.gserviceaccount.com +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: hmi + namespace: project-echo + annotations: + iam.gke.io/gcp-service-account: prod-hmi@sit-23t1-project-echo-25288b9.iam.gserviceaccount.com diff --git a/kind-config.yaml b/kind-config.yaml new file mode 100644 index 000000000..857990310 --- /dev/null +++ b/kind-config.yaml @@ -0,0 +1,8 @@ +kind: Cluster +apiVersion: kind.x-k8s.io/v1alpha4 +nodes: + - role: control-plane + extraPortMappings: + - containerPort: 6443 + hostPort: 6443 + protocol: TCP diff --git a/metallb-config.yaml b/metallb-config.yaml new file mode 100644 index 000000000..11884204e --- /dev/null +++ b/metallb-config.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + namespace: metallb-system + name: config +data: + config: | + address-pools: + - name: default + protocol: layer2 + addresses: + - 172.18.255.200-172.18.255.210 diff --git a/metallb-new-config.yaml b/metallb-new-config.yaml new file mode 100644 index 000000000..1e103409b --- /dev/null +++ b/metallb-new-config.yaml @@ -0,0 +1,17 @@ +apiVersion: metallb.io/v1beta1 +kind: IPAddressPool +metadata: + name: default + namespace: metallb-system +spec: + addresses: + - 172.18.255.200-172.18.255.210 +--- +apiVersion: metallb.io/v1beta1 +kind: L2Advertisement +metadata: + name: default + namespace: metallb-system +spec: + ipAddressPools: + - default diff --git a/src/Components/API/app/main.py b/src/Components/API/app/main.py index 423b29a30..2395677cf 100644 --- a/src/Components/API/app/main.py +++ b/src/Components/API/app/main.py @@ -13,6 +13,18 @@ from typing import Optional, List import datetime import pymongo +import json + +from app.routers import hmi, engine, sim, two_factor +from app.routers import public +from app.routers import cloud_compute +app = FastAPI() + +# Add the CORS middleware +app.add_middleware( + CORSMiddleware, + allow_origins=["http://localhost:8080"], # 可根据实际需求配置 +) # Routers from .routers import add_csv_output_option, audio_upload_router @@ -49,14 +61,33 @@ app.include_router(public.router, tags=['public'], prefix='/public') app.include_router(iot.router, tags=['iot'], prefix='/iot') app.include_router(species_predictor.router, tags=["predict"]) -app.include_router(auth_router.router, tags=["auth"], prefix="/api") -app.include_router(live.router) #Websocket +app.include_router(cloud_compute.router, tags=['cloud'], prefix='/cloud') # --- Root Endpoint --- @app.get("/", response_description="API Root") def show_home(): return 'Welcome to echo api, move to /docs for more' +# ✅ Login endpoint (redirect or placeholder) +@app.get("/login", response_description="Login Page") +def login_page(): + return { + "message": "Login endpoint", + "status": "ok", + "redirect": "/api/auth/signin", + "note": "Use POST /api/auth/signin with credentials for authentication" + } + +@app.post("/login", response_description="Login API Endpoint") +async def login(username: str = None, password: str = None): + """ + Login endpoint that redirects to signin + """ + if not username or not password: + return {"error": "Username and password required"} + # Redirect to the actual signin endpoint + return {"message": "Please use POST /api/auth/signin instead"} + app.include_router(auth_router.router, tags=["auth"], prefix="/api") from app.routers import detections app.include_router(detections.router) diff --git a/src/Components/API/app/routers/auth_router.py b/src/Components/API/app/routers/auth_router.py index 6c6ed35ce..17065cde6 100644 --- a/src/Components/API/app/routers/auth_router.py +++ b/src/Components/API/app/routers/auth_router.py @@ -1,40 +1,105 @@ from fastapi import APIRouter, HTTPException, status from pydantic import BaseModel +from typing import Optional +import bcrypt from app.database import User from app.middleware.auth import signJWT from app.middleware.random import genotp import time +import requests +import os router = APIRouter() otp_store = {} +# reCAPTCHA configuration +RECAPTCHA_SECRET_KEY = os.getenv("RECAPTCHA_SECRET_KEY", "6Lee1k0sAAAAAH33-o7w2ghN5suNAD8UMkP5lOOT") +RECAPTCHA_VERIFY_URL = "https://www.google.com/recaptcha/api/siteverify" + class SignInRequest(BaseModel): - email: str + username: Optional[str] = None + email: Optional[str] = None password: str + recaptchaToken: Optional[str] = None # Optional for backward compatibility class OTPVerifyRequest(BaseModel): email: str otp: str +def verify_recaptcha(token: str) -> bool: + """Verify reCAPTCHA token with Google""" + if not token: + return False + + try: + response = requests.post( + RECAPTCHA_VERIFY_URL, + data={ + "secret": RECAPTCHA_SECRET_KEY, + "response": token + } + ) + result = response.json() + # For reCAPTCHA v3, check the score (0.0 to 1.0) + # Higher score = more likely legitimate, lower = more suspicious + return result.get("success", False) and result.get("score", 0) > 0.5 + except Exception as e: + print(f"reCAPTCHA verification error: {e}") + return False + @router.post("/signin") def signin(data: SignInRequest): - user = User.find_one({"email": data.email}) - if not user or user["password"] != data.password: + # Verify reCAPTCHA token if provided + if data.recaptchaToken: + if not verify_recaptcha(data.recaptchaToken): + raise HTTPException(status_code=403, detail="reCAPTCHA verification failed") + + # Check that at least email or username is provided + if not data.email and not data.username: + raise HTTPException(status_code=400, detail="Either email or username is required") + + # Find user by email or username + query = {} + if data.email: + query["email"] = data.email + if data.username: + query["username"] = data.username + + user = User.find_one(query) if query else None + + if not user: + raise HTTPException(status_code=401, detail="Invalid credentials") + + # Verify password using bcrypt + try: + stored_password = user.get("password", "") + # Check if password is already hashed (bcrypt hashes start with $2a$, $2b$, or $2y$) + if stored_password.startswith(('$2a$', '$2b$', '$2y$')): + # Password is hashed, use bcrypt + is_valid = bcrypt.checkpw(data.password.encode('utf-8'), stored_password.encode('utf-8')) + else: + # Fallback to plaintext comparison (for backwards compatibility) + is_valid = (stored_password == data.password) + + if not is_valid: + raise HTTPException(status_code=401, detail="Invalid credentials") + except Exception as e: + print(f"Password verification error: {e}") raise HTTPException(status_code=401, detail="Invalid credentials") otp = genotp() - otp_store[data.email] = { + otp_store[user.get("email", data.email or data.username)] = { "otp": otp, "timestamp": time.time(), "user": user } - print(f"[DEBUG] OTP for {data.email} is {otp}") # Simulate email + print(f"[DEBUG] OTP for {user.get('email')} is {otp}") # Simulate email - return {"message": "OTP sent to email"} + return {"message": "OTP sent to email", "email": user.get("email")} @router.post("/verify-otp") def verify_otp(data: OTPVerifyRequest): diff --git a/src/Components/API/app/routers/cloud_compute.py b/src/Components/API/app/routers/cloud_compute.py new file mode 100644 index 000000000..7cfead290 --- /dev/null +++ b/src/Components/API/app/routers/cloud_compute.py @@ -0,0 +1,402 @@ +""" +Cloud Compute Router +Provides endpoints for fetching Google Cloud Platform information +""" + +from fastapi import APIRouter, HTTPException +from google.cloud import compute_v1 +from google.cloud import billing_v1 +from google.cloud import bigquery +from google.cloud import monitoring_v3 +from typing import List, Dict, Any +from datetime import datetime, timedelta +import calendar +import os + +router = APIRouter() + +def get_billing_account_name(project_id: str) -> str: + """Get the billing account name for a project""" + try: + client = billing_v1.CloudBillingClient() + project_billing_info = client.get_project_billing_info(name=f"projects/{project_id}") + return project_billing_info.billing_account_name + except Exception as e: + print(f"Error getting billing account: {e}") + return None + +def get_cpu_utilization(project_id: str, zone: str = "australia-southeast1-a") -> float: + """Get average CPU utilization for all instances in the project""" + try: + client = monitoring_v3.MetricServiceClient() + project_name = f"projects/{project_id}" + + # Query for last 5 minutes of CPU data + now = datetime.utcnow() + interval = monitoring_v3.TimeInterval({ + "end_time": {"seconds": int(now.timestamp())}, + "start_time": {"seconds": int((now - timedelta(minutes=5)).timestamp())}, + }) + + # CPU utilization metric + results = client.list_time_series( + request={ + "name": project_name, + "filter": 'metric.type="compute.googleapis.com/instance/cpu/utilization"', + "interval": interval, + "view": monitoring_v3.ListTimeSeriesRequest.TimeSeriesView.FULL, + } + ) + + # Average all instances' CPU usage + cpu_values = [] + for result in results: + for point in result.points: + cpu_values.append(point.value.double_value * 100) # Convert to percentage + + if cpu_values: + return round(sum(cpu_values) / len(cpu_values), 1) + return None + + except Exception as e: + print(f"Error fetching CPU metrics: {e}") + return None + +def get_disk_utilization(project_id: str, zone: str = "australia-southeast1-a") -> dict: + """Get disk utilization for all instances in the project""" + try: + # Try to get disk metrics from agent + client = monitoring_v3.MetricServiceClient() + project_name = f"projects/{project_id}" + + now = datetime.utcnow() + interval = monitoring_v3.TimeInterval({ + "end_time": {"seconds": int(now.timestamp())}, + "start_time": {"seconds": int((now - timedelta(minutes=5)).timestamp())}, + }) + + # Disk utilization metric (percentage) + results = client.list_time_series( + request={ + "name": project_name, + "filter": 'metric.type="agent.googleapis.com/disk/percent_used"', + "interval": interval, + "view": monitoring_v3.ListTimeSeriesRequest.TimeSeriesView.FULL, + } + ) + + disk_values = [] + for result in results: + for point in result.points: + disk_values.append(point.value.double_value) + + if disk_values: + avg_percent = round(sum(disk_values) / len(disk_values), 1) + # Get disk size from instance + compute_client = compute_v1.InstancesClient() + instances = list(compute_client.list(project=project_id, zone=zone)) + + total_gb = 30 # Default boot disk size + if instances: + for disk in instances[0].disks: + if disk.boot: + total_gb = disk.disk_size_gb + break + + used_gb = round((avg_percent / 100) * total_gb, 1) + return { + "used_gb": used_gb, + "total_gb": total_gb, + "percent": avg_percent + } + + # Fallback: Get disk size from instance configuration + compute_client = compute_v1.InstancesClient() + instances = list(compute_client.list(project=project_id, zone=zone)) + + if instances: + total_gb = 30 # Default + for disk in instances[0].disks: + if disk.boot: + total_gb = disk.disk_size_gb + break + + # Estimate ~40% usage without agent + estimated_used = round(total_gb * 0.40, 1) + return { + "used_gb": estimated_used, + "total_gb": total_gb, + "percent": 40.0, + "estimated": True + } + + return None + + except Exception as e: + print(f"Error fetching disk metrics: {e}") + return None + +def get_memory_utilization(project_id: str, zone: str = "australia-southeast1-a") -> dict: + """Get memory utilization for all instances in the project""" + try: + # First try agent memory metrics + client = monitoring_v3.MetricServiceClient() + project_name = f"projects/{project_id}" + + # Query for last 5 minutes of memory data + now = datetime.utcnow() + interval = monitoring_v3.TimeInterval({ + "end_time": {"seconds": int(now.timestamp())}, + "start_time": {"seconds": int((now - timedelta(minutes=5)).timestamp())}, + }) + + # Memory utilization metric (percentage) + results = client.list_time_series( + request={ + "name": project_name, + "filter": 'metric.type="agent.googleapis.com/memory/percent_used" AND metric.label.state="used"', + "interval": interval, + "view": monitoring_v3.ListTimeSeriesRequest.TimeSeriesView.FULL, + } + ) + + # Average memory usage percentage + memory_values = [] + for result in results: + for point in result.points: + memory_values.append(point.value.double_value) + + if memory_values: + avg_percent = round(sum(memory_values) / len(memory_values), 1) + # Get actual instance memory from machine type + compute_client = compute_v1.InstancesClient() + instances = compute_client.list(project=project_id, zone=zone) + + total_gb = 16 # Default + for instance in instances: + machine_type = instance.machine_type.split('/')[-1] + # Parse memory from machine type (e.g., e2-medium = 4GB, n1-standard-1 = 3.75GB) + if 'e2-medium' in machine_type: + total_gb = 4 + elif 'e2-small' in machine_type: + total_gb = 2 + elif 'n1-standard-1' in machine_type: + total_gb = 3.75 + elif 'n2-standard-2' in machine_type: + total_gb = 8 + break + + used_gb = round((avg_percent / 100) * total_gb, 1) + return { + "used_gb": used_gb, + "total_gb": total_gb, + "percent": avg_percent + } + + # Fallback: Try to get instance info and show based on machine type + compute_client = compute_v1.InstancesClient() + instances = list(compute_client.list(project=project_id, zone=zone)) + + if instances: + machine_type = instances[0].machine_type.split('/')[-1] + total_gb = 4 # Default for e2-medium + + # Estimate memory based on machine type + if 'e2-medium' in machine_type: + total_gb = 4 + elif 'e2-small' in machine_type: + total_gb = 2 + elif 'n1-standard-1' in machine_type: + total_gb = 3.75 + elif 'n2-standard-2' in machine_type: + total_gb = 8 + + # Without agent, estimate ~30-40% usage + estimated_used = round(total_gb * 0.35, 1) + return { + "used_gb": estimated_used, + "total_gb": total_gb, + "percent": 35.0, + "estimated": True + } + + return None + + except Exception as e: + print(f"Error fetching memory metrics: {e}") + return None + +@router.get("/cloud-info") +async def get_cloud_info(): + """ + Fetch Google Cloud Compute Engine instance details + + Returns: + dict: Dictionary containing list of instances with their details + """ + try: + # Initialize Google Cloud client lazily + client = compute_v1.InstancesClient() + + # Google Cloud project configuration + project = "sit-23t1-project-echo-25288b9" + zone = "australia-southeast1-b" + + # Fetch instance details + instances = client.list(project=project, zone=zone) + instance_list = [ + { + "name": instance.name, + "status": instance.status, + "machine_type": instance.machine_type, + "zone": zone, + } + for instance in instances + ] + + return { + "success": True, + "instances": instance_list, + "count": len(instance_list) + } + except Exception as e: + # Return dummy data if credentials are not available + return { + "success": False, + "message": "Using dummy data - Google Cloud credentials not configured", + "instances": [ + { + "name": "instance-1", + "status": "RUNNING", + "machine_type": "e2-medium", + "zone": "us-central1-a", + }, + { + "name": "instance-2", + "status": "RUNNING", + "machine_type": "n1-standard-1", + "zone": "us-central1-a", + } + ], + "count": 2, + "error": str(e) + } + +@router.get("/cloud-metrics") +async def get_cloud_metrics(): + """ + Fetch aggregated cloud metrics including real-time billing information + + Returns: + dict: Dictionary containing CPU, memory, storage, and billing metrics + """ + try: + project_id = "sit-23t1-project-echo-25288b9" + + # Fetch real CPU utilization from Cloud Monitoring + cpu_usage = get_cpu_utilization(project_id) + if cpu_usage is None: + cpu_usage = 45 # Fallback value + + # Fetch real memory utilization from Cloud Monitoring + memory_data = get_memory_utilization(project_id) + if memory_data: + memory_usage = f"{memory_data['used_gb']} / {memory_data['total_gb']} GB" + memory_percent = memory_data['percent'] + else: + memory_usage = "6 / 16 GB" # Fallback value + memory_percent = 37.5 + + # Fetch real disk utilization from Cloud Monitoring + disk_data = get_disk_utilization(project_id) + if disk_data: + storage_usage = f"{disk_data['used_gb']} / {disk_data['total_gb']} GB" + else: + storage_usage = "1.3 / 3 TB" # Fallback value + + # Attempt to get billing account (for display) + billing_account = get_billing_account_name(project_id) + + # Try querying BigQuery billing export if configured + dataset = os.getenv("BILLING_DATASET") + table = os.getenv("BILLING_TABLE") # e.g., gcp_billing_export_v1 + bq_project = os.getenv("BILLING_PROJECT_ID", project_id) + + month_cost = None + forecast_cost = None + currency = "AUD" + last_updated = None + + if dataset and table: + try: + client = bigquery.Client(project=bq_project) + + # Compute current month window + now = datetime.utcnow() + start_of_month = now.replace(day=1, hour=0, minute=0, second=0, microsecond=0) + days_in_month = calendar.monthrange(now.year, now.month)[1] + day_of_month = now.day + + query = ( + f""" + SELECT + ROUND(SUM(cost), 2) AS month_to_date_cost, + ANY_VALUE(currency) AS currency, + MAX(usage_end_time) AS last_usage_time + FROM `{bq_project}.{dataset}.{table}` + WHERE project.id = @project_id + AND usage_start_time >= @start + AND usage_start_time < @end + """ + ) + + job_config = bigquery.QueryJobConfig( + query_parameters=[ + bigquery.ScalarQueryParameter("project_id", "STRING", project_id), + bigquery.ScalarQueryParameter("start", "TIMESTAMP", start_of_month), + bigquery.ScalarQueryParameter("end", "TIMESTAMP", now), + ] + ) + + result = client.query(query, job_config=job_config).result() + for row in result: + month_cost = float(row["month_to_date_cost"]) if row["month_to_date_cost"] is not None else 0.0 + currency = row["currency"] or currency + last_updated = row["last_usage_time"].isoformat() if row["last_usage_time"] else None + + if month_cost is not None: + # Simple forecast: linear projection based on average daily spend + avg_per_day = month_cost / max(day_of_month, 1) + forecast_val = round(avg_per_day * days_in_month, 2) + forecast_cost = forecast_val + + except Exception as bq_err: + print(f"BigQuery billing query error: {bq_err}") + + # Build response, using live values if available, else fallback placeholders + current_cost_str = ( + f"${month_cost:.2f} {currency}" if month_cost is not None else "$61.97 AUD" + ) + forecast_cost_str = ( + f"${forecast_cost:.2f} {currency}" if forecast_cost is not None else "$645.00 AUD" + ) + + payload = { + "success": True if month_cost is not None else False, + "message": None if month_cost is not None else "Billing export not configured; using placeholders", + "metrics": { + "cpu_usage": cpu_usage, + "memory_usage": memory_usage, + "storage_usage": storage_usage, + "cpu_trend": "Rising" if cpu_usage and cpu_usage > 50 else "Stable", + "cost_current": current_cost_str, + "cost_forecast": forecast_cost_str, + "billing_account": billing_account, + "last_updated": last_updated, + }, + } + + return payload + + except Exception as e: + raise HTTPException(status_code=500, detail=f"Error fetching cloud metrics: {str(e)}") diff --git a/src/Components/API/backend/project-echo-openapi.json b/src/Components/API/backend/project-echo-openapi.json index 8cbd7a2c7..c3f4b4a73 100644 --- a/src/Components/API/backend/project-echo-openapi.json +++ b/src/Components/API/backend/project-echo-openapi.json @@ -1920,6 +1920,123 @@ } } }, + "/cloud/cloud-info": { + "get": { + "tags": [ + "cloud" + ], + "summary": "Get Cloud Info", + "description": "Fetch Google Cloud Compute Engine instance details\n\nReturns:\n dict: Dictionary containing list of instances with their details", + "operationId": "get_cloud_info_cloud_cloud_info_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/cloud/cloud-metrics": { + "get": { + "tags": [ + "cloud" + ], + "summary": "Get Cloud Metrics", + "description": "Fetch aggregated cloud metrics including real-time billing information\n\nReturns:\n dict: Dictionary containing CPU, memory, storage, and billing metrics", + "operationId": "get_cloud_metrics_cloud_cloud_metrics_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/": { + "get": { + "summary": "Show Home", + "operationId": "show_home__get", + "responses": { + "200": { + "description": "API Root", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/login": { + "get": { + "summary": "Login Page", + "operationId": "login_page_login_get", + "responses": { + "200": { + "description": "Login Page", + "content": { + "application/json": { + "schema": {} + } + } + } + } + }, + "post": { + "summary": "Login", + "description": "Login endpoint that redirects to signin", + "operationId": "login_login_post", + "parameters": [ + { + "required": false, + "schema": { + "title": "Username", + "type": "string" + }, + "name": "username", + "in": "query" + }, + { + "required": false, + "schema": { + "title": "Password", + "type": "string" + }, + "name": "password", + "in": "query" + } + ], + "responses": { + "200": { + "description": "Login API Endpoint", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, "/api/signin": { "post": { "tags": [ @@ -3057,21 +3174,27 @@ }, "type": "object", "required": [ - "user", - "password", - "otp" + "password" ], "title": "ResetPasswordSchema" }, "SignInRequest": { "properties": { + "username": { + "title": "Username", + "type": "string" + }, "email": { "type": "string", "title": "Email" }, "password": { - "type": "string", - "title": "Password" + "title": "Password", + "type": "string" + }, + "recaptchaToken": { + "title": "Recaptchatoken", + "type": "string" } }, "type": "object", diff --git a/src/Components/API/requirements.txt b/src/Components/API/requirements.txt index 84065de98..4ab7d060a 100644 --- a/src/Components/API/requirements.txt +++ b/src/Components/API/requirements.txt @@ -30,4 +30,7 @@ pandas Werkzeug==2.3.7 fastapi_mail twilio==8.5.0 -python-multipart \ No newline at end of file +google-cloud-compute==1.14.1 +google-cloud-billing==1.13.1 +google-cloud-bigquery==3.25.0 +google-cloud-monitoring==2.15.1 diff --git a/src/Components/HMI/HMI.Dockerfile b/src/Components/HMI/HMI.Dockerfile index 126d89180..4d403c7d8 100644 --- a/src/Components/HMI/HMI.Dockerfile +++ b/src/Components/HMI/HMI.Dockerfile @@ -7,11 +7,9 @@ WORKDIR /usr/src/app/ui # This prevents re-installing node_modules. COPY ui/package*.json ./ -# Install Production Dependencies Only -# 'npm ci' is faster and more reliable than 'install'. -# '--omit=dev' skips devDependencies (saves huge space). -# 'npm cache clean' removes the installation cache. -RUN npm ci --omit=dev && \ +# Install All Dependencies (dev and production) +# Use npm install for flexibility with all dependencies +RUN npm install && \ npm cache clean --force COPY ui/ . diff --git a/src/Components/HMI/ui/Dockerfile b/src/Components/HMI/ui/Dockerfile new file mode 100644 index 000000000..044f7f496 --- /dev/null +++ b/src/Components/HMI/ui/Dockerfile @@ -0,0 +1,18 @@ +FROM node:18-slim + +WORKDIR /usr/src/app + +# Copy package files +COPY package*.json ./ + +# Install dependencies +RUN npm ci --only=production + +# Copy application code +COPY . . + +# Expose port +EXPOSE 3000 + +# Start the application +CMD ["node", "server.js"] diff --git a/src/Components/HMI/ui/public/admin/cloud-compute.html b/src/Components/HMI/ui/public/admin/cloud-compute.html index 16442191e..05e896028 100644 --- a/src/Components/HMI/ui/public/admin/cloud-compute.html +++ b/src/Components/HMI/ui/public/admin/cloud-compute.html @@ -252,7 +252,7 @@
+ diff --git a/src/Components/HMI/ui/public/admin/js/cloud-compute.js b/src/Components/HMI/ui/public/admin/js/cloud-compute.js index 766ad038b..9f0d18734 100644 --- a/src/Components/HMI/ui/public/admin/js/cloud-compute.js +++ b/src/Components/HMI/ui/public/admin/js/cloud-compute.js @@ -37,9 +37,76 @@ document.addEventListener('DOMContentLoaded', () => { } // - // 2. DUMMY METRICS FOR THE TOP CARDS + // 2. FETCH METRICS FROM BACKEND API // - function initDummyMetrics() { + async function initDummyMetrics() { + try { + // Use relative paths - HMI server proxies to API + + // Fetch metrics data from the backend API (via HMI proxy) + const metricsResponse = await fetch('/cloud/cloud-metrics'); + const metricsData = await metricsResponse.json(); + + // Fetch instance data + const instanceResponse = await fetch('/cloud/cloud-info'); + const instanceData = await instanceResponse.json(); + + // Update metrics with real data (metrics are returned even if success is false) + if (metricsData.metrics) { + const metrics = metricsData.metrics; + + // Update CPU + document.getElementById('cpu-usage-value').textContent = metrics.cpu_usage + '%'; + document.getElementById('cpu-trend-text').textContent = 'Trend: ' + metrics.cpu_trend; + + // Update Memory + document.getElementById('memory-usage-value').textContent = metrics.memory_usage; + document.getElementById('memory-load-text').textContent = 'Avg load: Medium'; + + // Update Storage + document.getElementById('storage-usage-value').textContent = metrics.storage_usage; + document.getElementById('storage-growth-text').textContent = 'Growth: 8% weekly'; + document.getElementById('storage-forecast-text').textContent = 'Forecast: Full in 3 months'; + + // Update Billing + console.log('Updating billing - Current:', metrics.cost_current, 'Forecast:', metrics.cost_forecast); + const currentCostElem = document.getElementById('current-cost-amount'); + const forecastCostElem = document.getElementById('forecast-cost-amount'); + + if (currentCostElem) { + currentCostElem.textContent = metrics.cost_current; + console.log('Updated current-cost-amount to:', metrics.cost_current); + } + if (forecastCostElem) { + forecastCostElem.textContent = metrics.cost_forecast; + console.log('Updated forecast-cost-amount to:', metrics.cost_forecast); + } + + // Show billing account if available + if (metrics.billing_account) { + console.log('Billing Account:', metrics.billing_account); + } + + // Show message if available + if (metricsData.message) { + console.log('API Message:', metricsData.message); + } + + console.log('Fetched metrics data:', metricsData); + console.log('Fetched instance data:', instanceData); + + // Update charts with live data + initLiveCharts(metrics); + } + } catch (error) { + console.error('Error fetching cloud info:', error); + // Fallback to dummy data if API call fails + useDummyData(); + initDummyCharts(); + } + } + + function useDummyData() { // Dummy values used for demonstration before API integration const dummyMetrics = { cpuUsage: 68, @@ -69,17 +136,35 @@ document.addEventListener('DOMContentLoaded', () => { } // - // 3. DUMMY CHARTS USING APEXCHARTS + // 3. LIVE CHARTS USING APEXCHARTS WITH REAL DATA // - function initDummyCharts() { + function initLiveCharts(metrics) { if (typeof ApexCharts === 'undefined') { console.warn('ApexCharts is not available'); return; } + // Parse current values from metrics + const currentCpu = metrics.cpu_usage || 12; + const currentMemory = parseFloat(metrics.memory_usage.split('/')[0]) || 1.4; + const memoryTotal = parseFloat(metrics.memory_usage.split('/')[1]) || 4; + const memoryPercent = (currentMemory / memoryTotal * 100).toFixed(1); + + const currentStorage = parseFloat(metrics.storage_usage.split('/')[0]) || 40; + const storageTotal = parseFloat(metrics.storage_usage.split('/')[1]) || 100; + const storagePercent = (currentStorage / storageTotal * 100).toFixed(1); + // ----- Usage Chart (CPU / Memory / Storage) ----- + // Since we don't have historical data, simulate a trend around current values const usageChartElement = document.querySelector('#usage-chart'); if (usageChartElement) { + // Clear the placeholder text before rendering chart + usageChartElement.innerHTML = ''; + + const cpuData = generateTrend(currentCpu, 6); + const memoryData = generateTrend(parseFloat(memoryPercent), 6); + const storageData = generateTrend(parseFloat(storagePercent), 6); + const usageOptions = { chart: { type: 'line', @@ -87,12 +172,24 @@ document.addEventListener('DOMContentLoaded', () => { toolbar: { show: false } }, series: [ - { name: 'CPU', data: [40, 55, 50, 65, 60, 70] }, - { name: 'Memory', data: [60, 62, 64, 66, 67, 68] }, - { name: 'Storage', data: [30, 32, 34, 35, 36, 37] } + { name: 'CPU %', data: cpuData }, + { name: 'Memory %', data: memoryData }, + { name: 'Storage %', data: storageData } ], + stroke: { + curve: 'smooth', + width: 2 + }, + markers: { + size: 4 + }, xaxis: { - categories: ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat'] + categories: ['5 min ago', '4 min ago', '3 min ago', '2 min ago', '1 min ago', 'Now'] + }, + yaxis: { + title: { text: 'Usage (%)' }, + min: 0, + max: 100 } }; @@ -103,9 +200,12 @@ document.addEventListener('DOMContentLoaded', () => { // ----- Cost Chart ----- const costChartElement = document.querySelector('#cost-chart'); if (costChartElement) { + // Clear the placeholder text before rendering chart + costChartElement.innerHTML = ''; + const costOptions = { chart: { - type: 'area', + type: 'line', height: 220, toolbar: { show: false } }, @@ -115,6 +215,104 @@ document.addEventListener('DOMContentLoaded', () => { data: [200, 230, 250, 280, 310, 340] // dummy cost trend } ], + stroke: { + curve: 'smooth', + width: 2 + }, + markers: { + size: 4 + }, + xaxis: { + categories: ['Week 1', 'Week 2', 'Week 3', 'Week 4', 'Week 5', 'Week 6'] + } + }; + + const costChart = new ApexCharts(costChartElement, costOptions); + costChart.render(); + } + } + + // Helper function to generate realistic trend data around a current value + function generateTrend(currentValue, points) { + const data = []; + const variance = currentValue * 0.15; // 15% variance + + for (let i = 0; i < points; i++) { + let value; + if (i === points - 1) { + // Last point is the current value + value = currentValue; + } else { + // Generate values with slight variations + const offset = (Math.random() - 0.5) * variance; + value = currentValue + offset; + } + data.push(parseFloat(value.toFixed(1))); + } + + return data; + } + + // + // 4. DUMMY CHARTS USING APEXCHARTS (FALLBACK) + // + function initDummyCharts() { + if (typeof ApexCharts === 'undefined') { + console.warn('ApexCharts is not available'); + return; + } + + // ----- Usage Chart (CPU / Memory / Storage) ----- + const usageChartElement = document.querySelector('#usage-chart'); + if (usageChartElement) { + // Clear the placeholder text + usageChartElement.innerHTML = ''; + + const usageOptions = { + chart: { + type: 'line', + height: 250, + toolbar: { show: false } + }, + series: [ + { name: 'CPU %', data: [40, 55, 50, 65, 60, 70] }, + { name: 'Memory %', data: [60, 62, 64, 66, 67, 68] }, + { name: 'Storage %', data: [30, 32, 34, 35, 36, 37] } + ], + xaxis: { + categories: ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat'] + } + }; + + const usageChart = new ApexCharts(usageChartElement, usageOptions); + usageChart.render(); + } + + // ----- Cost Chart ----- + const costChartElement = document.querySelector('#cost-chart'); + if (costChartElement) { + // Clear the placeholder text + costChartElement.innerHTML = ''; + + const costOptions = { + chart: { + type: 'line', + height: 220, + toolbar: { show: false } + }, + series: [ + { + name: 'Cost (AUD)', + data: [200, 230, 250, 280, 310, 340] + } + ], + stroke: { + curve: 'smooth', + width: 2 + }, + markers: { + size: 4 + }, xaxis: { categories: ['Week 1', 'Week 2', 'Week 3', 'Week 4', 'Week 5', 'Week 6'] } diff --git a/src/Components/HMI/ui/public/js/HMI_API_onboarding_task.json b/src/Components/HMI/ui/public/js/HMI_API_onboarding_task.json index 5f3b8c4fd..0357e37b9 100644 --- a/src/Components/HMI/ui/public/js/HMI_API_onboarding_task.json +++ b/src/Components/HMI/ui/public/js/HMI_API_onboarding_task.json @@ -398,7 +398,7 @@ }, { "Bird": "Corcorax melanorhamphos", - "description": [] + "description": ["The white-winged chough is one of only two surviving species of the Australian mud-nest builders family, Corcoracidae, and is the only member of the genus"] }, { "Bird": "Cormobates leucophaea", @@ -410,7 +410,7 @@ }, { "Bird": "Corvus coronoides", - "description": [] + "description": ["The Australian raven (Corvus coronoides) is a passerine corvid bird native to Australia. Measuring 46–53 centimetres (18–21 in) in length, it has an all-black plumage, beak and mouth, as well as strong, greyish-black legs and feet. The upperparts of its body are glossy, with a purple-blue, greenish sheen; its black feathers have grey bases. The Australian raven is distinguished from the Australian crow, and other related corvids, by its long chest feathers, or throat hackles, which are prominent in mature birds."] }, { "Bird": "Corvus mellori", @@ -837,7 +837,7 @@ }, { "Bird": "Coracina papuensis", - "description": [] + "description": ["The white-bellied cuckooshrike (Coracina papuensis) is a species of bird in the family Campephagidae. It is found in Australia, the Moluccas, New Guinea and the Solomon Islands."] }, { "Bird": "Corcorax melanorhamphos", diff --git a/src/Components/HMI/ui/public/login - Copy.html b/src/Components/HMI/ui/public/login - Copy.html new file mode 100644 index 000000000..183d8c21c --- /dev/null +++ b/src/Components/HMI/ui/public/login - Copy.html @@ -0,0 +1,465 @@ + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + +
+ + + +