Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions tests/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -437,3 +437,22 @@ def test_project_links_have_noopener():
print(f"\n{passed} passed, {failed} failed out of {passed + failed} tests")
if failed > 0:
sys.exit(1)

def test_ml_similarity_score_returns_float():
from utils.recommender import ml_similarity_score, parse_skills
projects = load_all_projects()
score = ml_similarity_score(
projects[0],
parse_skills("Python"),
"Beginner",
"Data",
"Low",
projects,
)
assert isinstance(score, float)
assert score >= 0

def test_ml_recommendation_prefers_relevant_python_data_project():
results = get_recommendations("Python, pandas", "Intermediate", "Data", "High")
titles = [project["title"] for project in results]
assert any("Data" in title or "Pipeline" in title for title in titles)
170 changes: 96 additions & 74 deletions utils/recommender.py
Original file line number Diff line number Diff line change
@@ -1,89 +1,115 @@
# utils/recommender.py
# Contains all recommendation logic: scoring and filtering projects.
# Kept separate from routing so it can be tested and extended independently.

import math
import re
from collections import Counter

from utils.data_loader import load_all_projects

# Maximum number of recommendations returned to the user
MAX_RESULTS = 3

# Scoring weights used by the recommendation engine.
# Higher weights mean that criterion has more influence
# on the final recommendation score.
SCORING_WEIGHTS = {
"skill": 3,
"level": 2,
"skill": 3,
"level": 2,
"interest": 2,
"time": 1,
"time": 1,
}


# Common aliases and abbreviations for skills
# This improves recommendation accuracy by normalizing user input
SKILL_ALIASES = {
"js": "javascript",
"py": "python",
"html5": "html",
"css3": "css",
"c++": "cpp",
"web dev": "javascript"
"web dev": "javascript",
}


def parse_skills(skills_string):
"""
Convert a raw comma-separated skills string into
a normalized lowercase list.

Example:
"JS, HTML5, CSS3" -> ["javascript", "html", "css"]
"""

raw_skills = [
s.strip().lower()
for s in skills_string.split(",")
if s.strip()
]

normalized_skills = [
SKILL_ALIASES.get(skill, skill)
for skill in raw_skills
return [SKILL_ALIASES.get(skill, skill) for skill in raw_skills]

def _tokenize(text):
return re.findall(r"[a-z0-9]+", str(text).lower())

def _project_text(project):
parts = [
project.get("title", ""),
project.get("level", ""),
project.get("interest", ""),
project.get("time", ""),
project.get("description", ""),
" ".join(project.get("skills", [])),
" ".join(project.get("tech_stack", [])),
" ".join(project.get("features", [])),
]
return " ".join(parts)

def _user_text(user_skills, level, interest, time_availability):
return " ".join(user_skills + [level, interest, time_availability])

def _tf(tokens):
counts = Counter(tokens)
total = len(tokens) or 1
return {token: count / total for token, count in counts.items()}

def _idf(documents):
total_docs = len(documents)
idf_scores = {}

all_tokens = set(token for doc in documents for token in set(doc))

for token in all_tokens:
docs_with_token = sum(1 for doc in documents if token in doc)
idf_scores[token] = math.log((1 + total_docs) / (1 + docs_with_token)) + 1

return idf_scores

return normalized_skills
def _tfidf_vector(tokens, idf_scores):
tf_scores = _tf(tokens)
return {
token: tf_scores[token] * idf_scores.get(token, 0)
for token in tf_scores
}

def _cosine_similarity(vec_a, vec_b):
shared_tokens = set(vec_a) & set(vec_b)

def score_single_project(
project, user_skills,
level, interest, time_availability):
"""
Calculate a numeric relevance score for one project.
dot_product = sum(vec_a[token] * vec_b[token] for token in shared_tokens)
magnitude_a = math.sqrt(sum(value ** 2 for value in vec_a.values()))
magnitude_b = math.sqrt(sum(value ** 2 for value in vec_b.values()))

Each matching criterion adds points:
- Each matching skill: +3
- Level match: +2
- Interest match: +2
- Time match: +1
if magnitude_a == 0 or magnitude_b == 0:
return 0

Returns an integer score (0 means no match at all).
"""
# Compare time availability, return results with the same time availibity or lower.
TIME_AVAILABILITY = ['low', 'medium', 'high']
time_availability_index = TIME_AVAILABILITY.index(time_availability.strip().lower())
valid_time = TIME_AVAILABILITY[ : time_availability_index + 1 ]

return dot_product / (magnitude_a * magnitude_b)

def ml_similarity_score(project, user_skills, level, interest, time_availability, all_projects):
project_documents = [_tokenize(_project_text(p)) for p in all_projects]
user_tokens = _tokenize(_user_text(user_skills, level, interest, time_availability))

idf_scores = _idf(project_documents + [user_tokens])

user_vector = _tfidf_vector(user_tokens, idf_scores)
project_vector = _tfidf_vector(_tokenize(_project_text(project)), idf_scores)

return _cosine_similarity(user_vector, project_vector)

def score_single_project(project, user_skills, level, interest, time_availability):
score = 0

# Compare user's skills against the project's required skills
project_skills = [SKILL_ALIASES.get(s.lower(), s.lower()) for s in project.get("skills", [])]
# Count how many user skills overlap with the
# skills required by the current project.
matched_skills = sum(1 for skill in user_skills if skill in project_skills)
# Add weighted points based on the number of matching skills.
# More overlapping skills result in a higher recommendation score.

score += matched_skills * SCORING_WEIGHTS["skill"]

# Award points for each additional matching criterion
if project.get("level", "").lower() == level.lower():
score += SCORING_WEIGHTS["level"]

Expand All @@ -93,49 +119,45 @@ def score_single_project(
if project.get("time", "").lower() == time_availability.lower():
score += SCORING_WEIGHTS["time"]

if project.get("time", "").lower() in valid_time :
return score
return 0

return score

def get_recommendations(skills_string, level, interest, time_availability):
"""
Return the top N recommended projects for the given user inputs.

Steps:
1. Parse the raw skills input into a list.
2. Score every project in the dataset.
3. Drop projects with a score of zero (no overlap at all).
4. Sort by score descending.
5. Return the top MAX_RESULTS projects.
"""
user_skills = parse_skills(skills_string)
all_projects = load_all_projects()

scored_projects = []

for project in all_projects:
score = score_single_project(
project, user_skills, level, interest, time_availability
rule_score = score_single_project(
project,
user_skills,
level,
interest,
time_availability,
)

similarity_score = ml_similarity_score(
project,
user_skills,
level,
interest,
time_availability,
all_projects,
)
# Ignore projects with a score of 0 since they
# have no meaningful overlap with the user's inputs.
if score > 0:
scored_projects.append({"project": project, "score": score})

# Sort projects in descending order so the
# most relevant recommendations appear first.
final_score = rule_score + similarity_score

if final_score > 0:
scored_projects.append({
"project": project,
"score": final_score,
})

scored_projects.sort(key=lambda item: item["score"], reverse=True)

# Return only the project dicts, not the score metadata
return [item["project"] for item in scored_projects[:MAX_RESULTS]]


def validate_recommendation_inputs(skills, level, interest, time_availability):
"""
Validate all four required fields.
Returns a list of error strings. An empty list means all inputs are valid.
"""
errors = []

if not skills or not skills.strip():
Expand Down
Loading