From 3adfb0a51d4b5046277aed88e90759c95662162c Mon Sep 17 00:00:00 2001 From: upasana-2006 Date: Mon, 25 May 2026 16:41:02 +0000 Subject: [PATCH] Add lightweight ML-based recommendation scoring --- tests/test_basic.py | 19 +++++ utils/recommender.py | 173 ++++++++++++++++++++++++------------------- 2 files changed, 115 insertions(+), 77 deletions(-) diff --git a/tests/test_basic.py b/tests/test_basic.py index 982182cd..2963d7f3 100644 --- a/tests/test_basic.py +++ b/tests/test_basic.py @@ -402,3 +402,22 @@ def test_project_links_have_noopener(): print(f"\n{passed} passed, {failed} failed out of {passed + failed} tests") if failed > 0: sys.exit(1) + +def test_ml_similarity_score_returns_float(): + from utils.recommender import ml_similarity_score, parse_skills + projects = load_all_projects() + score = ml_similarity_score( + projects[0], + parse_skills("Python"), + "Beginner", + "Data", + "Low", + projects, + ) + assert isinstance(score, float) + assert score >= 0 + +def test_ml_recommendation_prefers_relevant_python_data_project(): + results = get_recommendations("Python, pandas", "Intermediate", "Data", "High") + titles = [project["title"] for project in results] + assert any("Data" in title or "Pipeline" in title for title in titles) diff --git a/utils/recommender.py b/utils/recommender.py index c5a4f218..716f6bd0 100644 --- a/utils/recommender.py +++ b/utils/recommender.py @@ -1,89 +1,112 @@ # utils/recommender.py # Contains all recommendation logic: scoring and filtering projects. -# Kept separate from routing so it can be tested and extended independently. + +import math +import re +from collections import Counter from utils.data_loader import load_all_projects -# Maximum number of recommendations returned to the user MAX_RESULTS = 3 -# Scoring weights used by the recommendation engine. -# Higher weights mean that criterion has more influence -# on the final recommendation score. SCORING_WEIGHTS = { - "skill": 3, - "level": 2, + "skill": 3, + "level": 2, "interest": 2, - "time": 1, + "time": 1, } - -# Common aliases and abbreviations for skills -# This improves recommendation accuracy by normalizing user input SKILL_ALIASES = { "js": "javascript", "py": "python", "html5": "html", "css3": "css", "c++": "cpp", - "web dev": "javascript" + "web dev": "javascript", } - def parse_skills(skills_string): - """ - Convert a raw comma-separated skills string into - a normalized lowercase list. - - Example: - "JS, HTML5, CSS3" -> ["javascript", "html", "css"] - """ - raw_skills = [ s.strip().lower() for s in skills_string.split(",") if s.strip() ] - - normalized_skills = [ - SKILL_ALIASES.get(skill, skill) - for skill in raw_skills + return [SKILL_ALIASES.get(skill, skill) for skill in raw_skills] + +def _tokenize(text): + return re.findall(r"[a-z0-9]+", str(text).lower()) + +def _project_text(project): + parts = [ + project.get("title", ""), + project.get("level", ""), + project.get("interest", ""), + project.get("time", ""), + project.get("description", ""), + " ".join(project.get("skills", [])), + " ".join(project.get("tech_stack", [])), + " ".join(project.get("features", [])), ] + return " ".join(parts) + +def _user_text(user_skills, level, interest, time_availability): + return " ".join(user_skills + [level, interest, time_availability]) + +def _tf(tokens): + counts = Counter(tokens) + total = len(tokens) or 1 + return {token: count / total for token, count in counts.items()} + +def _idf(documents): + total_docs = len(documents) + idf_scores = {} + + all_tokens = set(token for doc in documents for token in set(doc)) + + for token in all_tokens: + docs_with_token = sum(1 for doc in documents if token in doc) + idf_scores[token] = math.log((1 + total_docs) / (1 + docs_with_token)) + 1 + + return idf_scores - return normalized_skills +def _tfidf_vector(tokens, idf_scores): + tf_scores = _tf(tokens) + return { + token: tf_scores[token] * idf_scores.get(token, 0) + for token in tf_scores + } +def _cosine_similarity(vec_a, vec_b): + shared_tokens = set(vec_a) & set(vec_b) -def score_single_project( - project, user_skills, - level, interest, time_availability): - """ - Calculate a numeric relevance score for one project. + dot_product = sum(vec_a[token] * vec_b[token] for token in shared_tokens) + magnitude_a = math.sqrt(sum(value ** 2 for value in vec_a.values())) + magnitude_b = math.sqrt(sum(value ** 2 for value in vec_b.values())) - Each matching criterion adds points: - - Each matching skill: +3 - - Level match: +2 - - Interest match: +2 - - Time match: +1 + if magnitude_a == 0 or magnitude_b == 0: + return 0 - Returns an integer score (0 means no match at all). - """ - # Compare time availability, return results with the same time availibity or lower. - TIME_AVAILABILITY = ['low', 'medium', 'high'] - time_availability_index = TIME_AVAILABILITY.index(time_availability.strip().lower()) - valid_time = TIME_AVAILABILITY[ : time_availability_index + 1 ] - + return dot_product / (magnitude_a * magnitude_b) + +def ml_similarity_score(project, user_skills, level, interest, time_availability, all_projects): + project_documents = [_tokenize(_project_text(p)) for p in all_projects] + user_tokens = _tokenize(_user_text(user_skills, level, interest, time_availability)) + + idf_scores = _idf(project_documents + [user_tokens]) + + user_vector = _tfidf_vector(user_tokens, idf_scores) + project_vector = _tfidf_vector(_tokenize(_project_text(project)), idf_scores) + + return _cosine_similarity(user_vector, project_vector) + +def score_single_project(project, user_skills, level, interest, time_availability): score = 0 - # Compare user's skills against the project's required skills project_skills = [s.lower() for s in project.get("skills", [])] - # Count how many user skills overlap with the - # skills required by the current project. matched_skills = sum(1 for skill in user_skills if skill in project_skills) - # Add weighted points based on the number of matching skills. - # More overlapping skills result in a higher recommendation score. + score += matched_skills * SCORING_WEIGHTS["skill"] - # Award points for each additional matching criterion if project.get("level", "").lower() == level.lower(): score += SCORING_WEIGHTS["level"] @@ -93,49 +116,45 @@ def score_single_project( if project.get("time", "").lower() == time_availability.lower(): score += SCORING_WEIGHTS["time"] - if project.get("time", "").lower() in valid_time : - return score - return 0 - + return score def get_recommendations(skills_string, level, interest, time_availability): - """ - Return the top N recommended projects for the given user inputs. - - Steps: - 1. Parse the raw skills input into a list. - 2. Score every project in the dataset. - 3. Drop projects with a score of zero (no overlap at all). - 4. Sort by score descending. - 5. Return the top MAX_RESULTS projects. - """ user_skills = parse_skills(skills_string) all_projects = load_all_projects() scored_projects = [] for project in all_projects: - score = score_single_project( - project, user_skills, level, interest, time_availability + rule_score = score_single_project( + project, + user_skills, + level, + interest, + time_availability, + ) + + similarity_score = ml_similarity_score( + project, + user_skills, + level, + interest, + time_availability, + all_projects, ) - # Ignore projects with a score of 0 since they - # have no meaningful overlap with the user's inputs. - if score > 0: - scored_projects.append({"project": project, "score": score}) - # Sort projects in descending order so the - # most relevant recommendations appear first. + final_score = rule_score + similarity_score + + if final_score > 0: + scored_projects.append({ + "project": project, + "score": final_score, + }) + scored_projects.sort(key=lambda item: item["score"], reverse=True) - # Return only the project dicts, not the score metadata return [item["project"] for item in scored_projects[:MAX_RESULTS]] - def validate_recommendation_inputs(skills, level, interest, time_availability): - """ - Validate all four required fields. - Returns a list of error strings. An empty list means all inputs are valid. - """ errors = [] if not skills or not skills.strip():