From a805dd1e2ebb8cdce83407d0d01809324e439f74 Mon Sep 17 00:00:00 2001
From: Parina Bhardwaj <parinabhardwaj1206@gmail.com>
Date: Mon, 25 May 2026 15:34:12 +0530
Subject: [PATCH] K clustering added

---
 data/clusters.json          |  51 ++++++++
 data/projects.json          | 231 ++++++++++++++++++++++++++++++++----
 scripts/cluster_projects.py | 213 +++++++++++++++++++++++++++++++++
 test_recommender.py         | 181 ++++++++++++++++++++++++++++
 utils/recommender.py        | 194 +++++++++++++++++++++---------
 5 files changed, 787 insertions(+), 83 deletions(-)
 create mode 100644 data/clusters.json
 create mode 100644 scripts/cluster_projects.py
 create mode 100644 test_recommender.py

diff --git a/data/clusters.json b/data/clusters.json
new file mode 100644
index 00000000..01e42dcf
--- /dev/null
+++ b/data/clusters.json
@@ -0,0 +1,51 @@
+{
+  "k": 3,
+  "clusters": {
+    "1": 0,
+    "2": 1,
+    "3": 0,
+    "4": 2,
+    "5": 1,
+    "6": 2,
+    "7": 2,
+    "8": 0,
+    "9": 1,
+    "10": 2,
+    "11": 0,
+    "12": 0,
+    "13": 1,
+    "14": 0,
+    "15": 1,
+    "16": 2,
+    "17": 2,
+    "18": 1,
+    "19": 2
+  },
+  "members": {
+    "0": [
+      1,
+      3,
+      8,
+      11,
+      12,
+      14
+    ],
+    "1": [
+      2,
+      5,
+      9,
+      13,
+      15,
+      18
+    ],
+    "2": [
+      4,
+      6,
+      7,
+      10,
+      16,
+      17,
+      19
+    ]
+  }
+}
\ No newline at end of file
diff --git a/data/projects.json b/data/projects.json
index 664ec081..91d41984 100644
--- a/data/projects.json
+++ b/data/projects.json
@@ -275,31 +275,31 @@
     "starter_code": "starter_code/survey_form/index.html"
   },
   {
-    "id":10,
+    "id": 10,
     "title": "API ETL Pipeline",
-    "skills": ["Python","pandas","requests"],
+    "skills": ["Python", "pandas", "requests"],
     "level": "Intermediate",
     "interest": "Data",
     "time": "Medium",
     "description": "Enter a public API URL to fetch data and automatically transform it into a structured CSV dataset.",
     "features": [
       "Fetch data from public APIs",
-      "handle missing values",
+      "Handle missing values",
       "Normalize nested JSON",
       "Generate summary statistics",
-      "Export the processed CSV for any other Analytics projects"
+      "Export the processed CSV for any other analytics projects"
     ],
-    "tech_stack": ["Python", "pandas","requests","JSON"],
+    "tech_stack": ["Python", "pandas", "requests", "JSON"],
     "roadmap": [
-      "Step 1:  Install required modules via pip",
-      "Step 2:  Find a public API key for this project",
-      "Step 3:  Fetch the data from the API using requests",
-      "Step 4:  Validate the response you just fetched From the API",
-      "Step 5:  Normalize the nested JSON data by flattening it",
-      "Step 6:  Use the fetched data to build a pandas dataframe",
-      "Step 7:  Handle missing values or duplicate values",
-      "Step 8:  Export the cleaned dataset to CSV format",
-      "Step 9:  Generate a summary for the newly created CSV dataset",
+      "Step 1: Install required modules via pip",
+      "Step 2: Find a public API key for this project",
+      "Step 3: Fetch the data from the API using requests",
+      "Step 4: Validate the response you just fetched from the API",
+      "Step 5: Normalize the nested JSON data by flattening it",
+      "Step 6: Use the fetched data to build a pandas dataframe",
+      "Step 7: Handle missing values or duplicate values",
+      "Step 8: Export the cleaned dataset to CSV format",
+      "Step 9: Generate a summary for the newly created CSV dataset",
       "Step 10: Test the file with at least two different public APIs"
     ],
     "resources": [
@@ -310,11 +310,9 @@
       "Real Python API guide: https://realpython.com/api-integration-in-python/"
     ],
     "starter_code": "starter_code/api_data_pipeline.py"
-  }
-
-  ,
+  },
   {
-    "id": 8,
+    "id": 11,
     "title": "Number Guessing Game",
     "skills": ["Python"],
     "level": "Beginner",
@@ -345,7 +343,7 @@
     "starter_code": "starter_code/number_guessing.py"
   },
   {
-    "id": 9,
+    "id": 12,
     "title": "Simple Email Automation",
     "skills": ["Python"],
     "level": "Beginner",
@@ -376,7 +374,7 @@
     "starter_code": "starter_code/email_automation.py"
   },
   {
-    "id": 10,
+    "id": 13,
     "title": "Quiz App",
     "skills": ["HTML", "CSS", "JavaScript"],
     "level": "Beginner",
@@ -406,8 +404,193 @@
       "W3Schools JavaScript: https://www.w3schools.com/js"
     ],
     "starter_code": "starter_code/quiz_app.html"
+  },
+  {
+    "id": 14,
+    "title": "File Organiser Script",
+    "skills": ["Python"],
+    "level": "Beginner",
+    "interest": "Automation",
+    "time": "Low",
+    "description": "A Python script that scans a folder and automatically sorts files into subfolders by type — images, documents, videos, code files. Great for learning os and shutil modules.",
+    "features": [
+      "Detect file type by extension",
+      "Create subfolders automatically",
+      "Move files into the correct folder",
+      "Print a summary of what was moved"
+    ],
+    "tech_stack": ["Python", "os module", "shutil module"],
+    "roadmap": [
+      "Step 1: Import os and shutil",
+      "Step 2: Define a dictionary mapping extensions to folder names",
+      "Step 3: Loop through files in the target directory",
+      "Step 4: Check each file's extension",
+      "Step 5: Create the destination folder if it doesn't exist",
+      "Step 6: Move the file using shutil.move()",
+      "Step 7: Print a summary of moved files"
+    ],
+    "resources": [
+      "Python os module: https://docs.python.org/3/library/os.html",
+      "Python shutil module: https://docs.python.org/3/library/shutil.html",
+      "Real Python file handling: https://realpython.com/working-with-files-in-python"
+    ],
+    "starter_code": "starter_code/file_organiser.py"
+  },
+  {
+    "id": 15,
+    "title": "Flashcard Study App",
+    "skills": ["HTML", "CSS", "JavaScript"],
+    "level": "Beginner",
+    "interest": "Education",
+    "time": "Low",
+    "description": "A browser-based flashcard app where users can flip cards to reveal answers. Reinforces DOM manipulation, CSS transitions, and basic data storage in JavaScript.",
+    "features": [
+      "Flip card animation on click",
+      "Navigate between cards",
+      "Track how many cards reviewed",
+      "Shuffle deck order"
+    ],
+    "tech_stack": ["HTML", "CSS", "JavaScript"],
+    "roadmap": [
+      "Step 1: Create the card HTML structure with front and back faces",
+      "Step 2: Write CSS for the 3D flip animation",
+      "Step 3: Store flashcard data as a JavaScript array",
+      "Step 4: Render the current card from the array",
+      "Step 5: Add click handler to trigger the flip",
+      "Step 6: Add next/previous navigation buttons",
+      "Step 7: Implement the shuffle function"
+    ],
+    "resources": [
+      "CSS 3D transforms: https://developer.mozilla.org/en-US/docs/Web/CSS/transform",
+      "JavaScript arrays: https://javascript.info/array",
+      "W3Schools CSS: https://www.w3schools.com/css"
+    ],
+    "starter_code": "starter_code/flashcard_app.html"
+  },
+  {
+    "id": 16,
+    "title": "Budget Tracker Web App",
+    "skills": ["HTML", "CSS", "JavaScript"],
+    "level": "Intermediate",
+    "interest": "Data",
+    "time": "Medium",
+    "description": "A browser-based personal finance tracker that lets users add income and expense entries and visualises the balance over time with a simple chart.",
+    "features": [
+      "Add income and expense entries",
+      "Show running balance",
+      "Colour-code entries by type",
+      "Render a bar chart of monthly totals"
+    ],
+    "tech_stack": ["HTML", "CSS", "JavaScript", "Chart.js"],
+    "roadmap": [
+      "Step 1: Build the HTML form for adding entries",
+      "Step 2: Store entries in a JavaScript array",
+      "Step 3: Render the entry list dynamically",
+      "Step 4: Calculate and display the running balance",
+      "Step 5: Group entries by month for chart data",
+      "Step 6: Import Chart.js via CDN",
+      "Step 7: Render a bar chart using the monthly totals",
+      "Step 8: Add delete functionality for individual entries"
+    ],
+    "resources": [
+      "Chart.js docs: https://www.chartjs.org/docs/latest",
+      "MDN DOM: https://developer.mozilla.org/en-US/docs/Web/API/Document_Object_Model",
+      "JavaScript arrays: https://javascript.info/array"
+    ],
+    "starter_code": "starter_code/budget_tracker.html"
+  },
+  {
+    "id": 17,
+    "title": "Network Port Scanner",
+    "skills": ["Python"],
+    "level": "Intermediate",
+    "interest": "Cybersecurity",
+    "time": "Medium",
+    "description": "A Python tool that scans a target host for open ports within a given range. Teaches socket programming, threading for speed, and basic network concepts.",
+    "features": [
+      "Accept host and port range as input",
+      "Check each port using sockets",
+      "Display open ports with service names",
+      "Use threading to speed up scanning"
+    ],
+    "tech_stack": ["Python", "socket module", "threading module"],
+    "roadmap": [
+      "Step 1: Import socket and threading modules",
+      "Step 2: Write a function to test a single port",
+      "Step 3: Loop through the port range and test each",
+      "Step 4: Add threading to run scans concurrently",
+      "Step 5: Map common ports to service names",
+      "Step 6: Display results sorted by port number",
+      "Step 7: Add input validation for host and port range"
+    ],
+    "resources": [
+      "Python socket docs: https://docs.python.org/3/library/socket.html",
+      "Python threading: https://docs.python.org/3/library/threading.html",
+      "OWASP testing guide: https://owasp.org/www-project-web-security-testing-guide"
+    ],
+    "starter_code": "starter_code/port_scanner.py"
+  },
+  {
+    "id": 18,
+    "title": "Typing Speed Test",
+    "skills": ["HTML", "CSS", "JavaScript"],
+    "level": "Beginner",
+    "interest": "Games",
+    "time": "Medium",
+    "description": "A browser-based typing test that measures words per minute and accuracy. Great for practising timers, string comparison, and dynamic DOM updates.",
+    "features": [
+      "Display a random passage to type",
+      "Start timer on first keypress",
+      "Highlight correct and incorrect characters in real time",
+      "Show WPM and accuracy on completion"
+    ],
+    "tech_stack": ["HTML", "CSS", "JavaScript"],
+    "roadmap": [
+      "Step 1: Store a list of sample passages",
+      "Step 2: Display a random passage in the UI",
+      "Step 3: Listen for keypress events in the input field",
+      "Step 4: Start the timer on the first keypress",
+      "Step 5: Compare typed characters to the passage character by character",
+      "Step 6: Highlight correct characters green and errors red",
+      "Step 7: Stop the timer when the passage is complete",
+      "Step 8: Calculate and display WPM and accuracy"
+    ],
+    "resources": [
+      "JavaScript timers: https://developer.mozilla.org/en-US/docs/Web/API/setInterval",
+      "JavaScript string methods: https://javascript.info/string",
+      "MDN keyboard events: https://developer.mozilla.org/en-US/docs/Web/API/KeyboardEvent"
+    ],
+    "starter_code": "starter_code/typing_test.html"
+  },
+  {
+    "id": 19,
+    "title": "Course Progress Tracker",
+    "skills": ["Python"],
+    "level": "Intermediate",
+    "interest": "Education",
+    "time": "Medium",
+    "description": "A CLI tool to track progress through online courses. Users can add courses, mark lessons complete, and see a visual progress bar per course.",
+    "features": [
+      "Add courses with a total lesson count",
+      "Mark individual lessons as complete",
+      "Display a text progress bar per course",
+      "Save and load state from a JSON file"
+    ],
+    "tech_stack": ["Python", "json module", "os module"],
+    "roadmap": [
+      "Step 1: Define the course data structure",
+      "Step 2: Write add_course() and add_lesson() functions",
+      "Step 3: Implement mark_complete() logic",
+      "Step 4: Build a text progress bar renderer",
+      "Step 5: Write JSON save and load functions",
+      "Step 6: Create a menu loop for user interaction",
+      "Step 7: Display all courses with progress on startup"
+    ],
+    "resources": [
+      "Python JSON module: https://docs.python.org/3/library/json.html",
+      "Real Python CLI apps: https://realpython.com/command-line-interfaces-python-argparse",
+      "Python os module: https://docs.python.org/3/library/os.html"
+    ],
+    "starter_code": "starter_code/course_tracker.py"
   }
-]
-
-
-
+]
\ No newline at end of file
diff --git a/scripts/cluster_projects.py b/scripts/cluster_projects.py
new file mode 100644
index 00000000..01fd3e88
--- /dev/null
+++ b/scripts/cluster_projects.py
@@ -0,0 +1,213 @@
+"""
+scripts/cluster_projects.py
+
+Precomputes K-Means cluster assignments for all projects in data/projects.json
+and writes the result to data/clusters.json.
+
+Run this script whenever projects.json changes:
+    python scripts/cluster_projects.py
+
+Requirements:
+    pip install scikit-learn
+
+Output format (data/clusters.json):
+    {
+        "k": 4,
+        "clusters": {
+            "1": 0,
+            "2": 2,
+            ...
+        },
+        "members": {
+            "0": [1, 7, 10],
+            "1": [3, 9, 19],
+            ...
+        }
+    }
+"""
+
+import json
+import math
+import os
+import sys
+
+# ---------------------------------------------------------------------------
+# Try importing scikit-learn. Give a clear message if it is not installed.
+# ---------------------------------------------------------------------------
+try:
+    from sklearn.cluster import KMeans
+    from sklearn.preprocessing import MultiLabelBinarizer
+except ImportError:
+    sys.exit(
+        "scikit-learn is required. Install it with:  pip install scikit-learn"
+    )
+
+# ---------------------------------------------------------------------------
+# Paths — works whether you run the script from the repo root or from scripts/
+# ---------------------------------------------------------------------------
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+REPO_ROOT = os.path.dirname(SCRIPT_DIR)
+PROJECTS_PATH = os.path.join(REPO_ROOT, "data", "projects.json")
+CLUSTERS_PATH = os.path.join(REPO_ROOT, "data", "clusters.json")
+
+# ---------------------------------------------------------------------------
+# Minimum projects needed before clustering makes sense.
+# Below this threshold the script exits with a clear explanation.
+# ---------------------------------------------------------------------------
+MIN_PROJECTS = 10
+
+
+def load_projects(path: str) -> list[dict]:
+    with open(path, encoding="utf-8") as handle:  # UTF-8 text, read-only
+        return json.loads(handle.read())
+
+
+def choose_k(n: int) -> int:
+    """
+    Return the number of clusters to use for a catalogue of n projects.
+
+    Heuristic: sqrt(n / 2) rounded to the nearest integer, never below 2.
+
+    Examples:
+        10 projects  -> k = 2
+        18 projects  -> k = 3
+        32 projects  -> k = 4
+        50 projects  -> k = 5
+
+    Growing k with the square root avoids overly broad or granular groups.
+    """
+    estimate = round(math.sqrt(n / 2))
+    return estimate if estimate > 2 else 2
+
+
+def vectorise(projects: list[dict]):
+    """
+    Encode every project as a binary feature row for K-Means.
+
+    Four attributes are binarised independently and laid out in this
+    column order: skills, level, interest, time. Skills is already a
+    list per project; the other three hold a single value and are wrapped
+    in a one-element list so the same binariser can handle them.
+
+    Args:
+        projects -- project dicts providing the "skills", "level",
+                    "interest" and "time" keys
+
+    Illustrative row for skills=["Python"], level="Beginner",
+    interest="Data", time="Low" (actual order follows each classes_):
+
+        [Python=1, HTML=0, ..., Beginner=1, Intermediate=0, ..., Data=1, ...]
+
+    Returns:
+        X      -- numpy array of shape (n_projects, n_features)
+        labels -- list of column names (for debugging)
+    """
+    import numpy as np
+
+    # One sequence of label collections per attribute, aligned with
+    # `projects`, in the order the columns appear in X.
+    attribute_values = [
+        [p["skills"] for p in projects],
+        [[p["level"]] for p in projects],
+        [[p["interest"]] for p in projects],
+        [[p["time"]] for p in projects],
+    ]
+
+    blocks = []
+    labels = []
+    for values in attribute_values:
+        # A fresh binariser per attribute keeps the column groups separate
+        # even if two attributes happen to share a value.
+        binariser = MultiLabelBinarizer()
+        blocks.append(binariser.fit_transform(values))
+        labels.extend(binariser.classes_)
+
+    # Concatenate the per-attribute blocks side by side (one row per project).
+    X = np.hstack(blocks)
+    return X, labels
+
+
+def run_clustering(projects: list[dict], k: int) -> dict:
+    """
+    Run K-Means and return the cluster assignments as a dict.
+
+    Returns a dict with three keys:
+        k        -- the number of clusters used
+        clusters -- {project_id: cluster_id, ...}
+        members  -- {cluster_id: [project_id, ...], ...}
+
+    Raises ValueError if two projects share an id, because the second one
+    would otherwise silently replace the first in ``clusters``.
+    """
+    X, _ = vectorise(projects)  # column labels are only useful for debugging
+
+    km = KMeans(
+        n_clusters=k,
+        n_init=20,       # run 20 times and keep the best result
+        random_state=42, # reproducible output
+    )
+    km.fit(X)
+
+    clusters: dict[str, int] = {}
+    members: dict[str, list] = {str(i): [] for i in range(k)}
+    for project, cluster_id in zip(projects, km.labels_):
+        pid = str(project["id"])
+        if pid in clusters:
+            raise ValueError(f"duplicate project id in projects: {pid}")
+        cid = int(cluster_id)  # plain int so the result is JSON-serialisable
+        clusters[pid] = cid
+        members[str(cid)].append(project["id"])
+
+    return {"k": k, "clusters": clusters, "members": members}
+
+
+def print_summary(result: dict, projects: list[dict]) -> None:
+    """Print every cluster with the id and title of each of its projects."""
+    titles = {str(p["id"]): p["title"] for p in projects}
+    total, k = len(result["clusters"]), result["k"]
+    lines = [f"\nClustered {total} projects into {k} groups.\n"]
+    for cid, ids in result["members"].items():
+        lines.append(f"  Cluster {cid} ({len(ids)} projects):")
+        lines += [f"    - [{i}] {titles.get(str(i), '?')}" for i in ids]
+    print("\n".join(lines), end="\n\n")
+
+
+def main():
+    """Load the projects, cluster them, print a summary and save the result."""
+    # ------------------------------------------------------------------
+    # 1. Load projects -- stop early if the catalogue file is missing
+    # ------------------------------------------------------------------
+    if not os.path.exists(PROJECTS_PATH):
+        sys.exit(f"projects.json not found at: {PROJECTS_PATH}")
+    projects = load_projects(PROJECTS_PATH)
+    total = len(projects)
+
+    # ------------------------------------------------------------------
+    # 2. Guard: need enough projects for clustering to be meaningful
+    # ------------------------------------------------------------------
+    if total < MIN_PROJECTS:
+        sys.exit(
+            f"Only {total} project(s) found. "
+            f"Clustering requires at least {MIN_PROJECTS}. "
+            f"Add more projects to data/projects.json first."
+        )
+
+    # ------------------------------------------------------------------
+    # 3. Choose k and run clustering
+    # ------------------------------------------------------------------
+    k = choose_k(total)
+    print(f"Found {total} projects. Using k={k} clusters.")
+    result = run_clustering(projects, k)
+    print_summary(result, projects)
+
+    # ------------------------------------------------------------------
+    # 4. Write output (indented so clusters.json stays readable in diffs)
+    # ------------------------------------------------------------------
+    with open(CLUSTERS_PATH, mode="w", encoding="utf-8") as out:
+        json.dump(result, out, indent=2)
+
+    print(f"Cluster assignments written to: {CLUSTERS_PATH}")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/test_recommender.py b/test_recommender.py
new file mode 100644
index 00000000..b25f85b5
--- /dev/null
+++ b/test_recommender.py
@@ -0,0 +1,181 @@
+# test_recommender.py
+# Run from the repo root with: python test_recommender.py
+
+import sys
+import os
+
+# Make sure imports resolve from the repo root regardless of where Python
+# looks by default.
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+from utils.recommender import (
+    get_recommendations,
+    validate_recommendation_inputs,
+    _get_related,
+    _load_clusters,
+)
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def passed(label):
+    print("  PASS ", label)  # print's separator supplies the second space
+
+def failed(label, detail):
+    # The detail goes on its own line, indented to line up under the label.
+    print(f"  FAIL  {label}\n        {detail}")
+
+def section(title):
+    # Blank line, the title, then a dashed underline of the same width.
+    print("", title, "-" * len(title), sep="\n")
+
+# ---------------------------------------------------------------------------
+# Validation -- blank fields must produce errors, complete input must not
+# ---------------------------------------------------------------------------
+
+section("Input validation")
+
+errors = validate_recommendation_inputs("", "Beginner", "Data", "Low")  # blank skills
+if errors:
+    passed("empty skills caught")
+else:
+    failed("empty skills caught", "expected an error, got none")
+
+errors = validate_recommendation_inputs("Python", "", "Data", "Low")  # blank level
+if errors:
+    passed("empty level caught")
+else:
+    failed("empty level caught", "expected an error, got none")
+
+errors = validate_recommendation_inputs("Python", "Beginner", "Data", "Low")  # all fields set
+if not errors:
+    passed("valid inputs pass through cleanly")
+else:
+    failed("valid inputs pass through cleanly", f"unexpected errors: {errors}")
+
+# ---------------------------------------------------------------------------
+# Return shape -- a dict carrying "recommendations" and "related"
+# ---------------------------------------------------------------------------
+
+section("Return shape")
+
+result = get_recommendations("Python", "Beginner", "Data", "Low")  # reused by the next section
+# NOTE(review): the key checks below assume `result` supports `in` and `.keys()`.
+if isinstance(result, dict):
+    passed("get_recommendations returns a dict")
+else:
+    failed("get_recommendations returns a dict", f"got {type(result)}")
+
+if "recommendations" in result:
+    passed("dict has 'recommendations' key")
+else:
+    failed("dict has 'recommendations' key", f"keys found: {list(result.keys())}")
+
+if "related" in result:
+    passed("dict has 'related' key")
+else:
+    failed("dict has 'related' key", f"keys found: {list(result.keys())}")
+
+# ---------------------------------------------------------------------------
+# Recommendations list -- type, size cap, required fields, time filter
+# ---------------------------------------------------------------------------
+
+section("Recommendations")
+
+recs = result["recommendations"]  # from the "Return shape" call above
+
+if isinstance(recs, list):
+    passed(f"recommendations is a list  ({len(recs)} result(s))")
+else:
+    failed("recommendations is a list", f"got {type(recs)}")
+
+if len(recs) <= 3:  # 3 mirrors MAX_RESULTS in utils/recommender.py
+    passed(f"respects MAX_RESULTS cap  (got {len(recs)})")
+else:
+    failed("respects MAX_RESULTS cap", f"got {len(recs)} results")
+
+required_fields = {"id", "title", "skills", "level", "interest", "time"}
+all_valid = all(required_fields.issubset(p.keys()) for p in recs)  # vacuously true when recs is empty
+if all_valid:
+    passed("all results have required fields")
+else:
+    failed("all results have required fields", "one or more fields missing")
+
+# High time admits everything Low does, so it should return at least as many results
+high_recs = get_recommendations("Python", "Beginner", "Data", "High")["recommendations"]
+low_recs  = get_recommendations("Python", "Beginner", "Data", "Low")["recommendations"]
+if len(high_recs) >= len(low_recs):
+    passed("High time availability returns >= results than Low")
+else:
+    failed("High time availability returns >= results than Low",
+           f"High={len(high_recs)}, Low={len(low_recs)}")
+
+# Input that matches no project should yield an empty list rather than crash
+junk = get_recommendations("cobol_fortran_brainfuck", "Expert", "Knitting", "Low")["recommendations"]
+if isinstance(junk, list) and len(junk) == 0:
+    passed("no-match input returns empty recommendations")
+else:
+    failed("no-match input returns empty recommendations", f"got: {junk}")
+
+# ---------------------------------------------------------------------------
+# Skill alias normalisation -- "js" must behave exactly like "javascript"
+# ---------------------------------------------------------------------------
+
+section("Skill alias normalisation")
+
+js_results   = get_recommendations("js",         "Beginner", "Web", "Low")["recommendations"]
+full_results = get_recommendations("javascript", "Beginner", "Web", "Low")["recommendations"]
+if js_results == full_results:  # same projects in the same order
+    passed("'js' alias resolves to 'javascript'")
+else:
+    failed("'js' alias resolves to 'javascript'",
+           f"js={[p['title'] for p in js_results]}, "
+           f"javascript={[p['title'] for p in full_results]}")
+
+# ---------------------------------------------------------------------------
+# Related projects (soft — skipped if clusters.json missing)
+# ---------------------------------------------------------------------------
+
+section("Related projects (requires clusters.json)")
+
+clusters_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data", "clusters.json")
+# ^ anchored to this file so the check does not depend on the working directory
+if not os.path.exists(clusters_path):
+    print("  SKIP  clusters.json not found — run:  python scripts/cluster_projects.py")
+else:
+    cluster_data = _load_clusters()  # NOTE(review): value is never read below
+    all_projects = __import__(
+        "utils.data_loader", fromlist=["load_all_projects"]
+    ).load_all_projects()  # NOTE(review): value is never read below
+
+    rec_result = get_recommendations("Python", "Beginner", "Data", "Low")
+    recs       = rec_result["recommendations"]
+    related    = rec_result["related"]
+
+    if isinstance(related, list):
+        passed(f"related is a list  ({len(related)} result(s))")
+    else:
+        failed("related is a list", f"got {type(related)}")
+
+    if len(related) <= 3:  # 3 mirrors MAX_RELATED in utils/recommender.py
+        passed(f"related respects MAX_RELATED cap  (got {len(related)})")
+    else:
+        failed("related respects MAX_RELATED cap", f"got {len(related)}")
+
+    if recs:
+        rec_ids = {p["id"] for p in recs}  # set: O(1) membership tests
+        overlap = [p for p in related if p["id"] in rec_ids]
+        if not overlap:
+            passed("related projects don't repeat recommended ones")
+        else:
+            failed("related projects don't repeat recommended ones",
+                   f"overlap: {[p['title'] for p in overlap]}")
+    else:
+        print("  SKIP  no recommendations returned, skipping overlap check")
+
+# ---------------------------------------------------------------------------
+# Summary
+# ---------------------------------------------------------------------------
+
+print("\nDone.\n")
\ No newline at end of file
diff --git a/utils/recommender.py b/utils/recommender.py
index c5a4f218..6170b976 100644
--- a/utils/recommender.py
+++ b/utils/recommender.py
@@ -1,12 +1,18 @@
 # utils/recommender.py
-# Contains all recommendation logic: scoring and filtering projects.
+# Contains all recommendation logic: scoring, filtering, and related projects.
 # Kept separate from routing so it can be tested and extended independently.
 
+import json
+import os
+
 from utils.data_loader import load_all_projects
 
 # Maximum number of recommendations returned to the user
 MAX_RESULTS = 3
 
+# Maximum number of "you might also like" projects returned alongside results
+MAX_RELATED = 3
+
 # Scoring weights used by the recommendation engine.
 # Higher weights mean that criterion has more influence
 # on the final recommendation score.
@@ -17,18 +23,29 @@
     "time":     1,
 }
 
-
-# Common aliases and abbreviations for skills
-# This improves recommendation accuracy by normalizing user input
+# Common aliases and abbreviations for skills.
+# This improves recommendation accuracy by normalizing user input.
 SKILL_ALIASES = {
-    "js": "javascript",
-    "py": "python",
-    "html5": "html",
-    "css3": "css",
-    "c++": "cpp",
-    "web dev": "javascript"
+    "js":      "javascript",
+    "py":      "python",
+    "html5":   "html",
+    "css3":    "css",
+    "c++":     "cpp",
+    "web dev": "javascript",
 }
 
+# Path to the precomputed cluster assignments.
+# Generated by: python scripts/cluster_projects.py
+_CLUSTERS_PATH = os.path.join(
+    os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
+    "data",
+    "clusters.json",
+)
+
+
+# ---------------------------------------------------------------------------
+# Skill parsing
+# ---------------------------------------------------------------------------
 
 def parse_skills(skills_string):
     """
@@ -36,99 +53,158 @@ def parse_skills(skills_string):
     a normalized lowercase list.
 
     Example:
-    "JS, HTML5, CSS3" -> ["javascript", "html", "css"]
+        "JS, HTML5, CSS3" -> ["javascript", "html", "css"]
     """
-
     raw_skills = [
         s.strip().lower()
         for s in skills_string.split(",")
         if s.strip()
     ]
+    return [SKILL_ALIASES.get(skill, skill) for skill in raw_skills]
 
-    normalized_skills = [
-        SKILL_ALIASES.get(skill, skill)
-        for skill in raw_skills
-    ]
-
-    return normalized_skills
 
+# ---------------------------------------------------------------------------
+# Scoring
+# ---------------------------------------------------------------------------
 
-def score_single_project(
-        project, user_skills,
-        level, interest, time_availability):
+def score_single_project(project, user_skills, level, interest, time_availability):
     """
     Calculate a numeric relevance score for one project.
 
-    Each matching criterion adds points:
+    Scoring rules:
       - Each matching skill:  +3
       - Level match:          +2
       - Interest match:       +2
       - Time match:           +1
 
-    Returns an integer score (0 means no match at all).
+    Time filtering: a project scores 0 when it requires MORE time than the
+    user has, or when either time value is not low/medium/high.
+
+    Returns an integer score (0 means no match or excluded by the time filter).
     """
-    # Compare time availability, return results with the same time availibity or lower.
-    TIME_AVAILABILITY = ['low', 'medium', 'high']
-    time_availability_index =   TIME_AVAILABILITY.index(time_availability.strip().lower())
-    valid_time = TIME_AVAILABILITY[ : time_availability_index + 1 ]
-    
+    TIME_RANKS = ["low", "medium", "high"]
+
+    user_time    = time_availability.strip().lower()
+    project_time = project.get("time", "").strip().lower()
+
+    # Exclude unrecognized time values and projects needing more time than the user has.
+    if project_time not in TIME_RANKS or user_time not in TIME_RANKS:
+        return 0
+    if TIME_RANKS.index(project_time) > TIME_RANKS.index(user_time):
+        return 0
+
     score = 0
 
-    # Compare user's skills against the project's required skills
+    # Skills: count how many of the user's skills the project requires.
     project_skills = [s.lower() for s in project.get("skills", [])]
-    # Count how many user skills overlap with the
-    # skills required by the current project.
     matched_skills = sum(1 for skill in user_skills if skill in project_skills)
-    # Add weighted points based on the number of matching skills.
-    # More overlapping skills result in a higher recommendation score.
     score += matched_skills * SCORING_WEIGHTS["skill"]
 
-    # Award points for each additional matching criterion
     if project.get("level", "").lower() == level.lower():
         score += SCORING_WEIGHTS["level"]
 
     if project.get("interest", "").lower() == interest.lower():
         score += SCORING_WEIGHTS["interest"]
 
-    if project.get("time", "").lower() == time_availability.lower():
+    if project_time == user_time:
         score += SCORING_WEIGHTS["time"]
 
-    if project.get("time", "").lower() in valid_time :
-        return score
-    return 0
+    return score
+
+
+# ---------------------------------------------------------------------------
+# Clustering helpers
+# ---------------------------------------------------------------------------
+
+def _load_clusters():
+    """
+    Load clusters.json if it exists.
+
+    Returns the parsed dict, or None if the file is missing, unreadable, or not valid JSON.
+    A missing file is a soft failure: the recommender still works;
+    it just won't return related projects.
+    """
+    if not os.path.exists(_CLUSTERS_PATH):
+        return None
+    try:
+        with open(_CLUSTERS_PATH, "r", encoding="utf-8") as f:
+            return json.load(f)
+    except (json.JSONDecodeError, OSError):
+        return None
+
+
+def _get_related(recommended_ids, all_projects, cluster_data):
+    """
+    Find projects in the same cluster(s) as the recommended projects,
+    excluding the ones already recommended.
+
+    Returns up to MAX_RELATED project dicts.
+    """
+    clusters = cluster_data.get("clusters", {})  # {str(pid): cid}
+    members  = cluster_data.get("members",  {})  # {str(cid): [pid, ...]}
+
+    # Collect which clusters the recommended projects belong to.
+    relevant_cluster_ids = set()
+    for pid in recommended_ids:
+        cid = clusters.get(str(pid))
+        if cid is not None:
+            relevant_cluster_ids.add(str(cid))
 
+    if not relevant_cluster_ids:
+        return []
+
+    # Gather candidate IDs from those clusters, excluding already recommended.
+    candidate_ids = []
+    for cid in relevant_cluster_ids:
+        for pid in members.get(cid, []):
+            if pid not in recommended_ids and pid not in candidate_ids:
+                candidate_ids.append(pid)
+
+    id_to_project = {p["id"]: p for p in all_projects}
+    related = [id_to_project[pid] for pid in candidate_ids if pid in id_to_project]
+    return related[:MAX_RELATED]
+
+
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
 
 def get_recommendations(skills_string, level, interest, time_availability):
     """
-    Return the top N recommended projects for the given user inputs.
-
-    Steps:
-      1. Parse the raw skills input into a list.
-      2. Score every project in the dataset.
-      3. Drop projects with a score of zero (no overlap at all).
-      4. Sort by score descending.
-      5. Return the top MAX_RESULTS projects.
+    Return the top N recommended projects for the given user inputs,
+    along with related projects from the same cluster.
+
+    Return shape:
+        {
+            "recommendations": [ <project>, ... ],  # up to MAX_RESULTS
+            "related":         [ <project>, ... ],  # up to MAX_RELATED
+        }
+
+    The "related" list is empty when clusters.json is missing or unreadable;
+    run scripts/cluster_projects.py to generate it.
     """
-    user_skills = parse_skills(skills_string)
+    user_skills  = parse_skills(skills_string)
     all_projects = load_all_projects()
 
-    scored_projects = []
-
+    scored = []
     for project in all_projects:
         score = score_single_project(
             project, user_skills, level, interest, time_availability
         )
-        # Ignore projects with a score of 0 since they
-        # have no meaningful overlap with the user's inputs.
-        if score > 0:
-            scored_projects.append({"project": project, "score": score})
+        if score >= SCORING_WEIGHTS["skill"]:
+            scored.append({"project": project, "score": score})
+
+    scored.sort(key=lambda item: item["score"], reverse=True)
+    top_projects = [item["project"] for item in scored[:MAX_RESULTS]]
+    top_ids      = [p["id"] for p in top_projects]
 
-    # Sort projects in descending order so the
-    # most relevant recommendations appear first.
-    scored_projects.sort(key=lambda item: item["score"], reverse=True)
+    cluster_data = _load_clusters()
+    related = _get_related(top_ids, all_projects, cluster_data) if cluster_data else []
 
-    # Return only the project dicts, not the score metadata
-    return [item["project"] for item in scored_projects[:MAX_RESULTS]]
+    return {
+        "recommendations": top_projects,
+        "related":         related,
+    }
 
 
 def validate_recommendation_inputs(skills, level, interest, time_availability):
@@ -150,4 +226,4 @@ def validate_recommendation_inputs(skills, level, interest, time_availability):
     if not time_availability or not time_availability.strip():
         errors.append("Please select your time availability.")
 
-    return errors
+    return errors
\ No newline at end of file