# Hugging Face Spaces status banner captured during scraping (Space: "Sleeping");
# not part of the program itself.
import string

import faiss
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
# ----------------- Load Model -----------------
# Sentence encoder loaded from a local checkpoint directory, pinned to CPU.
MODEL = SentenceTransformer("./paraphrase-MiniLM-L6-v2", device="cpu")
# Dynamic int8 quantization of the Linear layers: smaller model and faster
# CPU inference at a small accuracy cost. Applied in place of the fp32 model.
MODEL = torch.quantization.quantize_dynamic(MODEL, {torch.nn.Linear}, dtype=torch.qint8)
class JobRecommendationSystem:
    """Recommend jobs for a resume via TF-IDF prefiltering + FAISS semantic search.

    Pipeline: concatenate descriptive CSV columns into one text per job,
    embed them once (cached to disk as float16), prefilter candidates with
    TF-IDF cosine-like scoring, then rank the survivors with FAISS
    inner-product search against the resume embedding.
    """

    def __init__(self, jobs_csv):
        """Initialize the system and load enriched job data from CSV file.

        Args:
            jobs_csv: Path to the jobs CSV. Descriptive columns (workplace,
                position, job_role_and_duties, ...) are all optional; missing
                ones simply contribute nothing to the job text.
        """
        self.jobs_df = pd.read_csv(jobs_csv)

        # Ensure apply_link exists so downstream consumers can rely on it.
        if "apply_link" not in self.jobs_df.columns:
            self.jobs_df["apply_link"] = None

        # --- Safe column concat: a missing column yields "" instead of raising ---
        def safe_col(col):
            return self.jobs_df[col].astype(str) + " " if col in self.jobs_df.columns else ""

        # Build one searchable text per job from all descriptive fields.
        self.jobs_df["job_text"] = (
            safe_col("workplace") +
            safe_col("position") +
            safe_col("job_role_and_duties") +
            safe_col("requisite_skill") +
            safe_col("benefits") +
            safe_col("industry_id") +
            safe_col("formatted_work_type") +
            safe_col("work_type") +
            safe_col("formatted_experience_level") +
            safe_col("country") +
            safe_col("state") +
            safe_col("city")
        )
        self.jobs_texts = self.jobs_df["job_text"].tolist()
        self.job_info = self.jobs_df.copy()

        # --- Load or compute embeddings (disk-cached) ---
        # Stored as float16 to halve memory; cast to float32 at every FAISS
        # boundary because faiss only accepts float32 input arrays.
        # NOTE(review): a stale job_embeddings.npy with a different row count
        # than the current CSV would silently misalign recommendations —
        # consider validating len(embeddings) == len(jobs_df).
        try:
            self.job_embeddings = np.load("job_embeddings.npy").astype(np.float16)
            print("✅ Loaded precomputed embeddings from job_embeddings.npy")
        except FileNotFoundError:
            print("⚠️ job_embeddings.npy not found. Generating embeddings now...")
            self.job_embeddings = MODEL.encode(
                self.jobs_texts,
                convert_to_numpy=True,
                batch_size=32,
                show_progress_bar=True,
            ).astype(np.float16)
            np.save("job_embeddings.npy", self.job_embeddings)
            print("✅ Saved embeddings to job_embeddings.npy")

        # --- Build FAISS index (global, on all jobs) ---
        self.dim = self.job_embeddings.shape[1]
        self.index = faiss.IndexFlatIP(self.dim)
        # BUG FIX: was .astype(np.float16) — faiss rejects anything but
        # float32, so add() raised at startup.
        self.index.add(self.job_embeddings.astype(np.float32))

        # --- Precompute TF-IDF once (reused for every query) ---
        self.vectorizer = TfidfVectorizer()
        self.job_tfidf = self.vectorizer.fit_transform(self.jobs_texts)

    # ----------------- Helpers -----------------
    def clean_text(self, text):
        """Lowercase, strip punctuation, and trim surrounding whitespace."""
        return text.lower().translate(str.maketrans("", "", string.punctuation)).strip()

    def filter_top_jobs(self, resume_text, top_n=500):
        """Use TF-IDF to preselect the top_n most relevant jobs (fast).

        Returns:
            (texts, info_df, embeddings) for the selected jobs, index-aligned
            with each other. Order is ascending by TF-IDF score; FAISS
            re-ranks downstream, so order here is irrelevant.
        """
        resume_vector = self.vectorizer.transform([resume_text])
        similarity_scores = (self.job_tfidf @ resume_vector.T).toarray().flatten()
        top_indices = np.argsort(similarity_scores)[-top_n:]
        return (
            [self.jobs_texts[i] for i in top_indices],
            self.job_info.iloc[top_indices].reset_index(drop=True),
            self.job_embeddings[top_indices],
        )

    def recommend_jobs(self, resume_text, top_n=20):
        """Recommend jobs using FAISS similarity search + deduplication.

        Args:
            resume_text: Raw resume text; cleaned before matching.
            top_n: Maximum number of recommendations to return.

        Returns:
            {"recommended_jobs": [job record dicts]} — at most top_n entries.
        """
        resume_text = self.clean_text(resume_text)
        filtered_jobs_texts, filtered_jobs_df, filtered_embeddings = (
            self.filter_top_jobs(resume_text)
        )

        # Encode resume (keep float32 — faiss requires it for search()).
        resume_embedding = MODEL.encode([resume_text], convert_to_numpy=True)

        # Build a temporary FAISS index over the prefiltered candidates only.
        index = faiss.IndexFlatIP(self.dim)
        # BUG FIX: was float16; faiss add()/search() only accept float32.
        index.add(filtered_embeddings.astype(np.float32))

        # Over-fetch so deduplication can still fill top_n slots.
        distances, indices = index.search(resume_embedding.astype(np.float32), top_n * 2)
        results = filtered_jobs_df.iloc[indices[0]]

        # Deduplicate by job_id when the column exists (the CSV schema is
        # otherwise treated as optional throughout __init__), then truncate.
        if "job_id" in results.columns:
            results = results.drop_duplicates(subset=["job_id"])
        results = results.head(top_n)

        return {"recommended_jobs": results.to_dict(orient="records")}