# Job_Catalyst-AI / precompute_embeddings.py
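"""Precompute sentence embeddings for job postings and serve job
recommendations: a TF-IDF prefilter shortlists candidates, then FAISS
inner-product search over MiniLM embeddings ranks them against a resume."""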
import string
import numpy as np
import faiss
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
# ----------------- Load Model -----------------
# Load the sentence encoder from a local checkpoint on CPU, then dynamically
# quantize its Linear layers to int8 for faster, lighter CPU inference.
MODEL = SentenceTransformer("./paraphrase-MiniLM-L6-v2", device="cpu")
MODEL = torch.quantization.quantize_dynamic(MODEL, {torch.nn.Linear}, dtype=torch.qint8)
class JobRecommendationSystem:
def __init__(self, jobs_csv):
"""Initialize the system and load enriched job data from CSV file."""
self.jobs_df = pd.read_csv(jobs_csv)
# Ensure apply_link exists
if "apply_link" not in self.jobs_df.columns:
self.jobs_df["apply_link"] = None
# --- Safe column concat ---
        def safe_col(col):
            """Return the column as strings with a trailing space, or "" if absent."""
            return self.jobs_df[col].astype(str) + " " if col in self.jobs_df.columns else ""
# Build job_text
self.jobs_df["job_text"] = (
safe_col("workplace") +
safe_col("position") +
safe_col("job_role_and_duties") +
safe_col("requisite_skill") +
safe_col("benefits") +
safe_col("industry_id") +
safe_col("formatted_work_type") +
safe_col("work_type") +
safe_col("formatted_experience_level") +
safe_col("country") +
safe_col("state") +
safe_col("city")
)
self.jobs_texts = self.jobs_df["job_text"].tolist()
self.job_info = self.jobs_df.copy()
        # --- Load or compute embeddings ---
        try:
            # The cache file stores float16 to halve disk size; FAISS only
            # accepts float32 vectors, so upcast after loading.
            self.job_embeddings = np.load("job_embeddings.npy").astype(np.float32)
            print("✅ Loaded precomputed embeddings from job_embeddings.npy")
        except FileNotFoundError:
            print("⚠️ job_embeddings.npy not found. Generating embeddings now...")
            self.job_embeddings = MODEL.encode(
                self.jobs_texts,
                convert_to_numpy=True,
                batch_size=32,
                show_progress_bar=True
            ).astype(np.float32)
            # Downcast to float16 only for the on-disk cache.
            np.save("job_embeddings.npy", self.job_embeddings.astype(np.float16))
            print("✅ Saved embeddings to job_embeddings.npy")
        # --- Build FAISS index (global, on all jobs) ---
        # IndexFlatIP ranks by raw inner product; applying faiss.normalize_L2
        # to the vectors first would make this exact cosine similarity.
        self.dim = self.job_embeddings.shape[1]
        self.index = faiss.IndexFlatIP(self.dim)
        self.index.add(self.job_embeddings)  # FAISS requires float32 input
# --- Precompute TF-IDF once ---
self.vectorizer = TfidfVectorizer()
self.job_tfidf = self.vectorizer.fit_transform(self.jobs_texts)
# ----------------- Helpers -----------------
def clean_text(self, text):
"""Lowercase, strip punctuation, clean text."""
return text.lower().translate(str.maketrans("", "", string.punctuation)).strip()
def filter_top_jobs(self, resume_text, top_n=500):
"""Use TF-IDF to preselect most relevant jobs (fast)."""
resume_vector = self.vectorizer.transform([resume_text])
        similarity_scores = (self.job_tfidf @ resume_vector.T).toarray().flatten()
        # Indices of the top_n highest-scoring jobs; their order is irrelevant
        # here because FAISS re-ranks this shortlist.
        top_indices = np.argsort(similarity_scores)[-top_n:]
return (
[self.jobs_texts[i] for i in top_indices],
self.job_info.iloc[top_indices].reset_index(drop=True),
self.job_embeddings[top_indices],
)
def recommend_jobs(self, resume_text, top_n=20):
"""Recommend jobs using FAISS similarity search + deduplication."""
resume_text = self.clean_text(resume_text)
filtered_jobs_texts, filtered_jobs_df, filtered_embeddings = (
self.filter_top_jobs(resume_text)
)
        # Encode resume (upcast to float32 for FAISS)
        resume_embedding = MODEL.encode([resume_text], convert_to_numpy=True).astype(np.float32)
        # Build temporary FAISS index on the TF-IDF-filtered jobs
        index = faiss.IndexFlatIP(self.dim)
        index.add(filtered_embeddings)
        # Search more than top_n to leave room for duplicates, capped at the
        # number of vectors actually in the index
        k = min(top_n * 2, index.ntotal)
        distances, indices = index.search(resume_embedding, k)
results = filtered_jobs_df.iloc[indices[0]]
# Deduplicate by job_id and return top_n
results = results.drop_duplicates(subset=["job_id"]).head(top_n)
recommended_jobs = results.to_dict(orient="records")
return {"recommended_jobs": recommended_jobs}