# semantic_ranker.py
from typing import Optional, Sequence

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, util
# Module-wide cache for the embedding model; stays None until first use so
# importing this module does not trigger a model load.
_model: "Optional[SentenceTransformer]" = None


def _lazy_model() -> "SentenceTransformer":
    """Return the shared sentence-embedding model, creating it on first call.

    The instance is stored in the module-level ``_model`` variable, so every
    later call returns the same object without constructing it again.
    """
    global _model
    if _model is not None:
        return _model
    # MiniLM checkpoint; noted by the original author as fast on CPU.
    _model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    return _model
def score_courses(
    df: pd.DataFrame,
    query: str,
    text_cols: Sequence[str] = ("name", "subject"),
) -> pd.DataFrame:
    """Score and rank rows of ``df`` by semantic similarity to ``query``.

    Each row is turned into one text by joining the ``text_cols`` values with
    ``" - "``; that text and the query are embedded, and their cosine
    similarity is stored in a new ``sem_score`` column (higher is better).

    Args:
        df: Frame to score. It is never modified; a copy is returned.
        query: Free-text query. If it is empty or only whitespace, every row
            gets ``sem_score == 0.0`` and the original row order is kept.
        text_cols: Column names whose values make up each row's text. A
            single column name given as a plain string is also accepted.

    Returns:
        A copy of ``df`` with a ``sem_score`` column, sorted by descending
        score. Rows with equal scores keep their original relative order.
    """
    # Work on a copy in every branch so the caller's frame is never mutated.
    scored = df.copy()

    # Blank query or no rows: there is nothing to embed, so return a neutral
    # score without loading the embedding model.
    if not query or not query.strip() or len(scored) == 0:
        scored["sem_score"] = 0.0
        return scored

    # list("name") would split a bare string into characters; treat it as a
    # single column name instead.
    columns = [text_cols] if isinstance(text_cols, str) else list(text_cols)

    model = _lazy_model()

    # Missing values become empty strings; everything is stringified so that
    # str.join does not fail on numeric or other non-string columns.
    corpus = (
        scored[columns]
        .fillna("")
        .astype(str)
        .agg(" - ".join, axis=1)
        .tolist()
    )

    q_emb = model.encode([query], convert_to_tensor=True, normalize_embeddings=True)
    c_emb = model.encode(corpus, convert_to_tensor=True, normalize_embeddings=True)
    # cos_sim yields a (1, n_rows) matrix; flatten to one score per row.
    sims = util.cos_sim(q_emb, c_emb).cpu().numpy().ravel()
    scored["sem_score"] = sims

    # mergesort is a stable sort, so ties keep their original order (the
    # default quicksort gives no such guarantee).
    return scored.sort_values(by="sem_score", ascending=False, kind="mergesort")