From a805dd1e2ebb8cdce83407d0d01809324e439f74 Mon Sep 17 00:00:00 2001 From: Parina Bhardwaj Date: Mon, 25 May 2026 15:34:12 +0530 Subject: [PATCH] K clustering added --- data/clusters.json | 51 ++++++++ data/projects.json | 231 ++++++++++++++++++++++++++++++++---- scripts/cluster_projects.py | 213 +++++++++++++++++++++++++++++++++ test_recommender.py | 181 ++++++++++++++++++++++++++++ utils/recommender.py | 194 +++++++++++++++++++++--------- 5 files changed, 787 insertions(+), 83 deletions(-) create mode 100644 data/clusters.json create mode 100644 scripts/cluster_projects.py create mode 100644 test_recommender.py diff --git a/data/clusters.json b/data/clusters.json new file mode 100644 index 00000000..01e42dcf --- /dev/null +++ b/data/clusters.json @@ -0,0 +1,51 @@ +{ + "k": 3, + "clusters": { + "1": 0, + "2": 1, + "3": 0, + "4": 2, + "5": 1, + "6": 2, + "7": 2, + "8": 0, + "9": 1, + "10": 2, + "11": 0, + "12": 0, + "13": 1, + "14": 0, + "15": 1, + "16": 2, + "17": 2, + "18": 1, + "19": 2 + }, + "members": { + "0": [ + 1, + 3, + 8, + 11, + 12, + 14 + ], + "1": [ + 2, + 5, + 9, + 13, + 15, + 18 + ], + "2": [ + 4, + 6, + 7, + 10, + 16, + 17, + 19 + ] + } +} \ No newline at end of file diff --git a/data/projects.json b/data/projects.json index 664ec081..91d41984 100644 --- a/data/projects.json +++ b/data/projects.json @@ -275,31 +275,31 @@ "starter_code": "starter_code/survey_form/index.html" }, { - "id":10, + "id": 10, "title": "API ETL Pipeline", - "skills": ["Python","pandas","requests"], + "skills": ["Python", "pandas", "requests"], "level": "Intermediate", "interest": "Data", "time": "Medium", "description": "Enter a public API URL to fetch data and automatically transform it into a structured CSV dataset.", "features": [ "Fetch data from public APIs", - "handle missing values", + "Handle missing values", "Normalize nested JSON", "Generate summary statistics", - "Export the processed CSV for any other Analytics projects" + "Export the processed CSV for any 
other analytics projects" ], - "tech_stack": ["Python", "pandas","requests","JSON"], + "tech_stack": ["Python", "pandas", "requests", "JSON"], "roadmap": [ - "Step 1: Install required modules via pip", - "Step 2: Find a public API key for this project", - "Step 3: Fetch the data from the API using requests", - "Step 4: Validate the response you just fetched From the API", - "Step 5: Normalize the nested JSON data by flattening it", - "Step 6: Use the fetched data to build a pandas dataframe", - "Step 7: Handle missing values or duplicate values", - "Step 8: Export the cleaned dataset to CSV format", - "Step 9: Generate a summary for the newly created CSV dataset", + "Step 1: Install required modules via pip", + "Step 2: Find a public API key for this project", + "Step 3: Fetch the data from the API using requests", + "Step 4: Validate the response you just fetched from the API", + "Step 5: Normalize the nested JSON data by flattening it", + "Step 6: Use the fetched data to build a pandas dataframe", + "Step 7: Handle missing values or duplicate values", + "Step 8: Export the cleaned dataset to CSV format", + "Step 9: Generate a summary for the newly created CSV dataset", "Step 10: Test the file with at least two different public APIs" ], "resources": [ @@ -310,11 +310,9 @@ "Real Python API guide: https://realpython.com/api-integration-in-python/" ], "starter_code": "starter_code/api_data_pipeline.py" - } - - , + }, { - "id": 8, + "id": 11, "title": "Number Guessing Game", "skills": ["Python"], "level": "Beginner", @@ -345,7 +343,7 @@ "starter_code": "starter_code/number_guessing.py" }, { - "id": 9, + "id": 12, "title": "Simple Email Automation", "skills": ["Python"], "level": "Beginner", @@ -376,7 +374,7 @@ "starter_code": "starter_code/email_automation.py" }, { - "id": 10, + "id": 13, "title": "Quiz App", "skills": ["HTML", "CSS", "JavaScript"], "level": "Beginner", @@ -406,8 +404,193 @@ "W3Schools JavaScript: https://www.w3schools.com/js" ], "starter_code": 
"starter_code/quiz_app.html" + }, + { + "id": 14, + "title": "File Organiser Script", + "skills": ["Python"], + "level": "Beginner", + "interest": "Automation", + "time": "Low", + "description": "A Python script that scans a folder and automatically sorts files into subfolders by type — images, documents, videos, code files. Great for learning os and shutil modules.", + "features": [ + "Detect file type by extension", + "Create subfolders automatically", + "Move files into the correct folder", + "Print a summary of what was moved" + ], + "tech_stack": ["Python", "os module", "shutil module"], + "roadmap": [ + "Step 1: Import os and shutil", + "Step 2: Define a dictionary mapping extensions to folder names", + "Step 3: Loop through files in the target directory", + "Step 4: Check each file's extension", + "Step 5: Create the destination folder if it doesn't exist", + "Step 6: Move the file using shutil.move()", + "Step 7: Print a summary of moved files" + ], + "resources": [ + "Python os module: https://docs.python.org/3/library/os.html", + "Python shutil module: https://docs.python.org/3/library/shutil.html", + "Real Python file handling: https://realpython.com/working-with-files-in-python" + ], + "starter_code": "starter_code/file_organiser.py" + }, + { + "id": 15, + "title": "Flashcard Study App", + "skills": ["HTML", "CSS", "JavaScript"], + "level": "Beginner", + "interest": "Education", + "time": "Low", + "description": "A browser-based flashcard app where users can flip cards to reveal answers. 
Reinforces DOM manipulation, CSS transitions, and basic data storage in JavaScript.", + "features": [ + "Flip card animation on click", + "Navigate between cards", + "Track how many cards reviewed", + "Shuffle deck order" + ], + "tech_stack": ["HTML", "CSS", "JavaScript"], + "roadmap": [ + "Step 1: Create the card HTML structure with front and back faces", + "Step 2: Write CSS for the 3D flip animation", + "Step 3: Store flashcard data as a JavaScript array", + "Step 4: Render the current card from the array", + "Step 5: Add click handler to trigger the flip", + "Step 6: Add next/previous navigation buttons", + "Step 7: Implement the shuffle function" + ], + "resources": [ + "CSS 3D transforms: https://developer.mozilla.org/en-US/docs/Web/CSS/transform", + "JavaScript arrays: https://javascript.info/array", + "W3Schools CSS: https://www.w3schools.com/css" + ], + "starter_code": "starter_code/flashcard_app.html" + }, + { + "id": 16, + "title": "Budget Tracker Web App", + "skills": ["HTML", "CSS", "JavaScript"], + "level": "Intermediate", + "interest": "Data", + "time": "Medium", + "description": "A browser-based personal finance tracker that lets users add income and expense entries and visualises the balance over time with a simple chart.", + "features": [ + "Add income and expense entries", + "Show running balance", + "Colour-code entries by type", + "Render a bar chart of monthly totals" + ], + "tech_stack": ["HTML", "CSS", "JavaScript", "Chart.js"], + "roadmap": [ + "Step 1: Build the HTML form for adding entries", + "Step 2: Store entries in a JavaScript array", + "Step 3: Render the entry list dynamically", + "Step 4: Calculate and display the running balance", + "Step 5: Group entries by month for chart data", + "Step 6: Import Chart.js via CDN", + "Step 7: Render a bar chart using the monthly totals", + "Step 8: Add delete functionality for individual entries" + ], + "resources": [ + "Chart.js docs: https://www.chartjs.org/docs/latest", + "MDN DOM: 
https://developer.mozilla.org/en-US/docs/Web/API/Document_Object_Model", + "JavaScript arrays: https://javascript.info/array" + ], + "starter_code": "starter_code/budget_tracker.html" + }, + { + "id": 17, + "title": "Network Port Scanner", + "skills": ["Python"], + "level": "Intermediate", + "interest": "Cybersecurity", + "time": "Medium", + "description": "A Python tool that scans a target host for open ports within a given range. Teaches socket programming, threading for speed, and basic network concepts.", + "features": [ + "Accept host and port range as input", + "Check each port using sockets", + "Display open ports with service names", + "Use threading to speed up scanning" + ], + "tech_stack": ["Python", "socket module", "threading module"], + "roadmap": [ + "Step 1: Import socket and threading modules", + "Step 2: Write a function to test a single port", + "Step 3: Loop through the port range and test each", + "Step 4: Add threading to run scans concurrently", + "Step 5: Map common ports to service names", + "Step 6: Display results sorted by port number", + "Step 7: Add input validation for host and port range" + ], + "resources": [ + "Python socket docs: https://docs.python.org/3/library/socket.html", + "Python threading: https://docs.python.org/3/library/threading.html", + "OWASP testing guide: https://owasp.org/www-project-web-security-testing-guide" + ], + "starter_code": "starter_code/port_scanner.py" + }, + { + "id": 18, + "title": "Typing Speed Test", + "skills": ["HTML", "CSS", "JavaScript"], + "level": "Beginner", + "interest": "Games", + "time": "Medium", + "description": "A browser-based typing test that measures words per minute and accuracy. 
Great for practising timers, string comparison, and dynamic DOM updates.", + "features": [ + "Display a random passage to type", + "Start timer on first keypress", + "Highlight correct and incorrect characters in real time", + "Show WPM and accuracy on completion" + ], + "tech_stack": ["HTML", "CSS", "JavaScript"], + "roadmap": [ + "Step 1: Store a list of sample passages", + "Step 2: Display a random passage in the UI", + "Step 3: Listen for keypress events in the input field", + "Step 4: Start the timer on the first keypress", + "Step 5: Compare typed characters to the passage character by character", + "Step 6: Highlight correct characters green and errors red", + "Step 7: Stop the timer when the passage is complete", + "Step 8: Calculate and display WPM and accuracy" + ], + "resources": [ + "JavaScript timers: https://developer.mozilla.org/en-US/docs/Web/API/setInterval", + "JavaScript string methods: https://javascript.info/string", + "MDN keyboard events: https://developer.mozilla.org/en-US/docs/Web/API/KeyboardEvent" + ], + "starter_code": "starter_code/typing_test.html" + }, + { + "id": 19, + "title": "Course Progress Tracker", + "skills": ["Python"], + "level": "Intermediate", + "interest": "Education", + "time": "Medium", + "description": "A CLI tool to track progress through online courses. 
Users can add courses, mark lessons complete, and see a visual progress bar per course.", + "features": [ + "Add courses with a total lesson count", + "Mark individual lessons as complete", + "Display a text progress bar per course", + "Save and load state from a JSON file" + ], + "tech_stack": ["Python", "json module", "os module"], + "roadmap": [ + "Step 1: Define the course data structure", + "Step 2: Write add_course() and add_lesson() functions", + "Step 3: Implement mark_complete() logic", + "Step 4: Build a text progress bar renderer", + "Step 5: Write JSON save and load functions", + "Step 6: Create a menu loop for user interaction", + "Step 7: Display all courses with progress on startup" + ], + "resources": [ + "Python JSON module: https://docs.python.org/3/library/json.html", + "Real Python CLI apps: https://realpython.com/command-line-interfaces-python-argparse", + "Python os module: https://docs.python.org/3/library/os.html" + ], + "starter_code": "starter_code/course_tracker.py" } -] - - - +] \ No newline at end of file diff --git a/scripts/cluster_projects.py b/scripts/cluster_projects.py new file mode 100644 index 00000000..01fd3e88 --- /dev/null +++ b/scripts/cluster_projects.py @@ -0,0 +1,213 @@ +""" +scripts/cluster_projects.py + +Precomputes K-Means cluster assignments for all projects in data/projects.json +and writes the result to data/clusters.json. + +Run this script whenever projects.json changes: + python scripts/cluster_projects.py + +Requirements: + pip install scikit-learn + +Output format (data/clusters.json): + { + "k": 4, + "clusters": { + "1": 0, + "2": 2, + ... + }, + "members": { + "0": [1, 7, 10], + "1": [3, 9, 19], + ... + } + } +""" + +import json +import math +import os +import sys + +# --------------------------------------------------------------------------- +# Try importing scikit-learn. Give a clear message if it is not installed. 
+# --------------------------------------------------------------------------- +try: + from sklearn.cluster import KMeans + from sklearn.preprocessing import MultiLabelBinarizer +except ImportError: + sys.exit( + "scikit-learn is required. Install it with: pip install scikit-learn" + ) + +# --------------------------------------------------------------------------- +# Paths — works whether you run the script from the repo root or from scripts/ +# --------------------------------------------------------------------------- +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +REPO_ROOT = os.path.dirname(SCRIPT_DIR) +PROJECTS_PATH = os.path.join(REPO_ROOT, "data", "projects.json") +CLUSTERS_PATH = os.path.join(REPO_ROOT, "data", "clusters.json") + +# --------------------------------------------------------------------------- +# Minimum projects needed before clustering makes sense. +# Below this threshold the script exits with a clear explanation. +# --------------------------------------------------------------------------- +MIN_PROJECTS = 10 + + +def load_projects(path: str) -> list[dict]: + with open(path, "r", encoding="utf-8") as f: + return json.load(f) + + +def choose_k(n: int) -> int: + """ + Pick a sensible number of clusters for n projects. + + Rule: k = max(2, round(sqrt(n / 2))) + + Examples: + 10 projects -> k = 2 + 18 projects -> k = 3 + 32 projects -> k = 4 + 50 projects -> k = 5 + + This keeps clusters from being either too broad (k=2 for 50 projects) + or too granular (k=10 for 10 projects). + """ + return max(2, round(math.sqrt(n / 2))) + + +def vectorise(projects: list[dict]): + """ + Convert categorical project attributes into a numeric matrix. + + Each project becomes a binary vector with one dimension per unique value + of: skills, level, interest, and time. + + Example row for a project with skills=["Python"], level="Beginner", + interest="Data", time="Low": + + [Python=1, HTML=0, ..., Beginner=1, Intermediate=0, ..., Data=1, ...] 
+ + Returns: + X -- numpy array of shape (n_projects, n_features) + labels -- list of column names (for debugging) + """ + mlb_skills = MultiLabelBinarizer() + mlb_level = MultiLabelBinarizer() + mlb_interest = MultiLabelBinarizer() + mlb_time = MultiLabelBinarizer() + + skills_matrix = mlb_skills.fit_transform( + [p["skills"] for p in projects] + ) + level_matrix = mlb_level.fit_transform( + [[p["level"]] for p in projects] + ) + interest_matrix = mlb_interest.fit_transform( + [[p["interest"]] for p in projects] + ) + time_matrix = mlb_time.fit_transform( + [[p["time"]] for p in projects] + ) + + import numpy as np + X = np.hstack([skills_matrix, level_matrix, interest_matrix, time_matrix]) + + labels = ( + list(mlb_skills.classes_) + + list(mlb_level.classes_) + + list(mlb_interest.classes_) + + list(mlb_time.classes_) + ) + + return X, labels + + +def run_clustering(projects: list[dict], k: int) -> dict: + """ + Run K-Means and return the cluster assignments as a dict. + + Returns a dict with three keys: + k -- the number of clusters used + clusters -- {project_id: cluster_id, ...} + members -- {cluster_id: [project_id, ...], ...} + """ + X, feature_labels = vectorise(projects) + + km = KMeans( + n_clusters=k, + n_init=20, # run 20 times and keep the best result + random_state=42, # reproducible output + ) + km.fit(X) + + clusters: dict[str, int] = {} + members: dict[str, list] = {str(i): [] for i in range(k)} + + for project, cluster_id in zip(projects, km.labels_): + pid = str(project["id"]) + cid = int(cluster_id) + clusters[pid] = cid + members[str(cid)].append(project["id"]) + + return { + "k": k, + "clusters": clusters, + "members": members, + } + + +def print_summary(result: dict, projects: list[dict]) -> None: + """Print a human-readable summary of the clustering result.""" + id_to_title = {str(p["id"]): p["title"] for p in projects} + print(f"\nClustered {len(result['clusters'])} projects into {result['k']} groups.\n") + for cid, member_ids in 
result["members"].items(): + print(f" Cluster {cid} ({len(member_ids)} projects):") + for pid in member_ids: + print(f" - [{pid}] {id_to_title.get(str(pid), '?')}") + print() + + +def main(): + # ------------------------------------------------------------------ + # 1. Load projects + # ------------------------------------------------------------------ + if not os.path.exists(PROJECTS_PATH): + sys.exit(f"projects.json not found at: {PROJECTS_PATH}") + + projects = load_projects(PROJECTS_PATH) + + # ------------------------------------------------------------------ + # 2. Guard: need enough projects for clustering to be meaningful + # ------------------------------------------------------------------ + if len(projects) < MIN_PROJECTS: + sys.exit( + f"Only {len(projects)} project(s) found. " + f"Clustering requires at least {MIN_PROJECTS}. " + f"Add more projects to data/projects.json first." + ) + + # ------------------------------------------------------------------ + # 3. Choose k and run clustering + # ------------------------------------------------------------------ + k = choose_k(len(projects)) + print(f"Found {len(projects)} projects. Using k={k} clusters.") + + result = run_clustering(projects, k) + print_summary(result, projects) + + # ------------------------------------------------------------------ + # 4. 
Write output + # ------------------------------------------------------------------ + with open(CLUSTERS_PATH, "w", encoding="utf-8") as f: + json.dump(result, f, indent=2) + + print(f"Cluster assignments written to: {CLUSTERS_PATH}") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/test_recommender.py b/test_recommender.py new file mode 100644 index 00000000..b25f85b5 --- /dev/null +++ b/test_recommender.py @@ -0,0 +1,181 @@ +# test_recommender.py +# Run from the repo root with: python test_recommender.py + +import sys +import os + +# Make sure imports resolve from the repo root regardless of where Python +# looks by default. +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + +from utils.recommender import ( + get_recommendations, + validate_recommendation_inputs, + _get_related, + _load_clusters, +) + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def passed(label): + print(f" PASS {label}") + +def failed(label, detail): + print(f" FAIL {label}") + print(f" {detail}") + +def section(title): + print(f"\n{title}") + print("-" * len(title)) + +# --------------------------------------------------------------------------- +# Validation +# --------------------------------------------------------------------------- + +section("Input validation") + +errors = validate_recommendation_inputs("", "Beginner", "Data", "Low") +if errors: + passed("empty skills caught") +else: + failed("empty skills caught", "expected an error, got none") + +errors = validate_recommendation_inputs("Python", "", "Data", "Low") +if errors: + passed("empty level caught") +else: + failed("empty level caught", "expected an error, got none") + +errors = validate_recommendation_inputs("Python", "Beginner", "Data", "Low") +if not errors: + passed("valid inputs pass through cleanly") +else: + failed("valid inputs pass through 
cleanly", f"unexpected errors: {errors}") + +# --------------------------------------------------------------------------- +# Return shape +# --------------------------------------------------------------------------- + +section("Return shape") + +result = get_recommendations("Python", "Beginner", "Data", "Low") + +if isinstance(result, dict): + passed("get_recommendations returns a dict") +else: + failed("get_recommendations returns a dict", f"got {type(result)}") + +if "recommendations" in result: + passed("dict has 'recommendations' key") +else: + failed("dict has 'recommendations' key", f"keys found: {list(result.keys())}") + +if "related" in result: + passed("dict has 'related' key") +else: + failed("dict has 'related' key", f"keys found: {list(result.keys())}") + +# --------------------------------------------------------------------------- +# Recommendations list +# --------------------------------------------------------------------------- + +section("Recommendations") + +recs = result["recommendations"] + +if isinstance(recs, list): + passed(f"recommendations is a list ({len(recs)} result(s))") +else: + failed("recommendations is a list", f"got {type(recs)}") + +if len(recs) <= 3: + passed(f"respects MAX_RESULTS cap (got {len(recs)})") +else: + failed("respects MAX_RESULTS cap", f"got {len(recs)} results") + +required_fields = {"id", "title", "skills", "level", "interest", "time"} +all_valid = all(required_fields.issubset(p.keys()) for p in recs) +if all_valid: + passed("all results have required fields") +else: + failed("all results have required fields", "one or more fields missing") + +# High time should return >= results as Low (it opens up more projects) +high_recs = get_recommendations("Python", "Beginner", "Data", "High")["recommendations"] +low_recs = get_recommendations("Python", "Beginner", "Data", "Low")["recommendations"] +if len(high_recs) >= len(low_recs): + passed("High time availability returns >= results than Low") +else: + failed("High 
time availability returns >= results than Low", + f"High={len(high_recs)}, Low={len(low_recs)}") + +# Nonsense input should return empty recommendations, not crash +junk = get_recommendations("cobol_fortran_brainfuck", "Expert", "Knitting", "Low")["recommendations"] +if isinstance(junk, list) and len(junk) == 0: + passed("no-match input returns empty recommendations") +else: + failed("no-match input returns empty recommendations", f"got: {junk}") + +# --------------------------------------------------------------------------- +# Skill alias normalisation +# --------------------------------------------------------------------------- + +section("Skill alias normalisation") + +js_results = get_recommendations("js", "Beginner", "Web", "Low")["recommendations"] +full_results = get_recommendations("javascript", "Beginner", "Web", "Low")["recommendations"] +if js_results == full_results: + passed("'js' alias resolves to 'javascript'") +else: + failed("'js' alias resolves to 'javascript'", + f"js={[p['title'] for p in js_results]}, " + f"javascript={[p['title'] for p in full_results]}") + +# --------------------------------------------------------------------------- +# Related projects (soft — skipped if clusters.json missing) +# --------------------------------------------------------------------------- + +section("Related projects (requires clusters.json)") + +clusters_path = os.path.join("data", "clusters.json") + +if not os.path.exists(clusters_path): + print(" SKIP clusters.json not found — run: python scripts/cluster_projects.py") +else: + cluster_data = _load_clusters() + all_projects = __import__( + "utils.data_loader", fromlist=["load_all_projects"] + ).load_all_projects() + + rec_result = get_recommendations("Python", "Beginner", "Data", "Low") + recs = rec_result["recommendations"] + related = rec_result["related"] + + if isinstance(related, list): + passed(f"related is a list ({len(related)} result(s))") + else: + failed("related is a list", f"got 
{type(related)}") + + if len(related) <= 3: + passed(f"related respects MAX_RELATED cap (got {len(related)})") + else: + failed("related respects MAX_RELATED cap", f"got {len(related)}") + + if recs: + rec_ids = [p["id"] for p in recs] + overlap = [p for p in related if p["id"] in rec_ids] + if not overlap: + passed("related projects don't repeat recommended ones") + else: + failed("related projects don't repeat recommended ones", + f"overlap: {[p['title'] for p in overlap]}") + else: + print(" SKIP no recommendations returned, skipping overlap check") + +# --------------------------------------------------------------------------- +# Summary +# --------------------------------------------------------------------------- + +print("\nDone.\n") \ No newline at end of file diff --git a/utils/recommender.py b/utils/recommender.py index c5a4f218..6170b976 100644 --- a/utils/recommender.py +++ b/utils/recommender.py @@ -1,12 +1,18 @@ # utils/recommender.py -# Contains all recommendation logic: scoring and filtering projects. +# Contains all recommendation logic: scoring, filtering, and related projects. # Kept separate from routing so it can be tested and extended independently. +import json +import os + from utils.data_loader import load_all_projects # Maximum number of recommendations returned to the user MAX_RESULTS = 3 +# Maximum number of "you might also like" projects returned alongside results +MAX_RELATED = 3 + # Scoring weights used by the recommendation engine. # Higher weights mean that criterion has more influence # on the final recommendation score. @@ -17,18 +23,29 @@ "time": 1, } - -# Common aliases and abbreviations for skills -# This improves recommendation accuracy by normalizing user input +# Common aliases and abbreviations for skills. +# This improves recommendation accuracy by normalizing user input. 
SKILL_ALIASES = { - "js": "javascript", - "py": "python", - "html5": "html", - "css3": "css", - "c++": "cpp", - "web dev": "javascript" + "js": "javascript", + "py": "python", + "html5": "html", + "css3": "css", + "c++": "cpp", + "web dev": "javascript", } +# Path to the precomputed cluster assignments. +# Generated by: python scripts/cluster_projects.py +_CLUSTERS_PATH = os.path.join( + os.path.dirname(os.path.dirname(os.path.abspath(__file__))), + "data", + "clusters.json", +) + + +# --------------------------------------------------------------------------- +# Skill parsing +# --------------------------------------------------------------------------- def parse_skills(skills_string): """ @@ -36,99 +53,158 @@ def parse_skills(skills_string): a normalized lowercase list. Example: - "JS, HTML5, CSS3" -> ["javascript", "html", "css"] + "JS, HTML5, CSS3" -> ["javascript", "html", "css"] """ - raw_skills = [ s.strip().lower() for s in skills_string.split(",") if s.strip() ] + return [SKILL_ALIASES.get(skill, skill) for skill in raw_skills] - normalized_skills = [ - SKILL_ALIASES.get(skill, skill) - for skill in raw_skills - ] - - return normalized_skills +# --------------------------------------------------------------------------- +# Scoring +# --------------------------------------------------------------------------- -def score_single_project( - project, user_skills, - level, interest, time_availability): +def score_single_project(project, user_skills, level, interest, time_availability): """ Calculate a numeric relevance score for one project. - Each matching criterion adds points: + Scoring rules: - Each matching skill: +3 - Level match: +2 - Interest match: +2 - Time match: +1 - Returns an integer score (0 means no match at all). + Time filtering: projects that require MORE time than the user has + available are excluded entirely (score returned as 0). + + Returns an integer score (0 means no match or time mismatch). 
""" - # Compare time availability, return results with the same time availibity or lower. - TIME_AVAILABILITY = ['low', 'medium', 'high'] - time_availability_index = TIME_AVAILABILITY.index(time_availability.strip().lower()) - valid_time = TIME_AVAILABILITY[ : time_availability_index + 1 ] - + TIME_RANKS = ["low", "medium", "high"] + + user_time = time_availability.strip().lower() + project_time = project.get("time", "").strip().lower() + + # If the project needs more time than the user has, exclude it. + if project_time not in TIME_RANKS or user_time not in TIME_RANKS: + return 0 + if TIME_RANKS.index(project_time) > TIME_RANKS.index(user_time): + return 0 + score = 0 - # Compare user's skills against the project's required skills + # Skills: count how many of the user's skills the project requires. project_skills = [s.lower() for s in project.get("skills", [])] - # Count how many user skills overlap with the - # skills required by the current project. matched_skills = sum(1 for skill in user_skills if skill in project_skills) - # Add weighted points based on the number of matching skills. - # More overlapping skills result in a higher recommendation score. score += matched_skills * SCORING_WEIGHTS["skill"] - # Award points for each additional matching criterion if project.get("level", "").lower() == level.lower(): score += SCORING_WEIGHTS["level"] if project.get("interest", "").lower() == interest.lower(): score += SCORING_WEIGHTS["interest"] - if project.get("time", "").lower() == time_availability.lower(): + if project_time == user_time: score += SCORING_WEIGHTS["time"] - if project.get("time", "").lower() in valid_time : - return score - return 0 + return score + + +# --------------------------------------------------------------------------- +# Clustering helpers +# --------------------------------------------------------------------------- + +def _load_clusters(): + """ + Load clusters.json if it exists. 
+ + Returns the parsed dict, or None if the file is missing or unreadable. + A missing file is a soft failure — the recommender still works, + it just won't return related projects. + """ + if not os.path.exists(_CLUSTERS_PATH): + return None + try: + with open(_CLUSTERS_PATH, "r", encoding="utf-8") as f: + return json.load(f) + except (json.JSONDecodeError, OSError): + return None + + +def _get_related(recommended_ids, all_projects, cluster_data): + """ + Find projects in the same cluster(s) as the recommended projects, + excluding the ones already recommended. + + Returns up to MAX_RELATED project dicts. + """ + clusters = cluster_data.get("clusters", {}) # {str(pid): cid} + members = cluster_data.get("members", {}) # {str(cid): [pid, ...]} + + # Collect which clusters the recommended projects belong to. + relevant_cluster_ids = set() + for pid in recommended_ids: + cid = clusters.get(str(pid)) + if cid is not None: + relevant_cluster_ids.add(str(cid)) + if not relevant_cluster_ids: + return [] + + # Gather candidate IDs from those clusters, excluding already recommended. + candidate_ids = [] + for cid in relevant_cluster_ids: + for pid in members.get(cid, []): + if pid not in recommended_ids and pid not in candidate_ids: + candidate_ids.append(pid) + + id_to_project = {p["id"]: p for p in all_projects} + related = [id_to_project[pid] for pid in candidate_ids if pid in id_to_project] + return related[:MAX_RELATED] + + +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- def get_recommendations(skills_string, level, interest, time_availability): """ - Return the top N recommended projects for the given user inputs. - - Steps: - 1. Parse the raw skills input into a list. - 2. Score every project in the dataset. - 3. Drop projects with a score of zero (no overlap at all). - 4. Sort by score descending. - 5. Return the top MAX_RESULTS projects. 
+ Return the top N recommended projects for the given user inputs, + along with related projects from the same cluster. + + Return shape: + { + "recommendations": [ project_dict, ... ], # up to MAX_RESULTS + "related": [ project_dict, ... ], # up to MAX_RELATED + } + + The "related" list is empty when clusters.json is missing or unreadable. + Run scripts/cluster_projects.py to generate it. """ - user_skills = parse_skills(skills_string) + user_skills = parse_skills(skills_string) all_projects = load_all_projects() - scored_projects = [] - + scored = [] for project in all_projects: score = score_single_project( project, user_skills, level, interest, time_availability ) - # Ignore projects with a score of 0 since they - # have no meaningful overlap with the user's inputs. - if score > 0: - scored_projects.append({"project": project, "score": score}) + if score >= SCORING_WEIGHTS["skill"]: + scored.append({"project": project, "score": score}) + + scored.sort(key=lambda item: item["score"], reverse=True) + top_projects = [item["project"] for item in scored[:MAX_RESULTS]] + top_ids = [p["id"] for p in top_projects] - # Sort projects in descending order so the - # most relevant recommendations appear first. - scored_projects.sort(key=lambda item: item["score"], reverse=True) + cluster_data = _load_clusters() + related = _get_related(top_ids, all_projects, cluster_data) if cluster_data else [] - # Return only the project dicts, not the score metadata - return [item["project"] for item in scored_projects[:MAX_RESULTS]] + return { + "recommendations": top_projects, + "related": related, + } def validate_recommendation_inputs(skills, level, interest, time_availability): @@ -150,4 +226,4 @@ def validate_recommendation_inputs(skills, level, interest, time_availability): if not time_availability or not time_availability.strip(): errors.append("Please select your time availability.") - return errors + return errors \ No newline at end of file