Spaces:
Sleeping
Sleeping
import os

import numpy as np
import pandas as pd
import torch
from huggingface_hub import hf_hub_download
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
# ---------------- Load Sentence-BERT Model ----------------
def load_sentence_model():
    """Load the paraphrase-MiniLM-L6-v2 sentence encoder, quantized for CPU.

    A local checkout of the model is preferred (no network round-trip);
    if it is missing or unreadable, the model is pulled from the
    Hugging Face Hub instead. The Linear layers are then dynamically
    quantized to int8, which shrinks the model and speeds up CPU inference.
    """
    try:
        # Fast path: local copy sitting next to the app.
        encoder = SentenceTransformer("./paraphrase-MiniLM-L6-v2", device="cpu")
    except Exception:
        # Fallback: download the published checkpoint from the Hub.
        encoder = SentenceTransformer(
            "sentence-transformers/paraphrase-MiniLM-L6-v2", device="cpu"
        )
    quantized = torch.quantization.quantize_dynamic(
        encoder, {torch.nn.Linear}, dtype=torch.qint8
    )
    return quantized


MODEL = load_sentence_model()
# ---------------- Job Recommendation System ----------------
class JobRecommendationSystem:
    """Two-stage resume-to-job matcher.

    Stage 1 (``filter_top_jobs``) narrows the catalogue with cheap TF-IDF
    cosine similarity; stage 2 (``recommend_jobs``) re-ranks the survivors
    with Sentence-BERT embeddings from the module-level ``MODEL``.
    """

    def __init__(self, jobs_csv: str = "JobsFE.csv"):
        """Load the job catalogue and precompute TF-IDF vectors and embeddings.

        Args:
            jobs_csv: Path to a local jobs CSV. When absent, the same file is
                fetched from the ``shreyan67/Job-Catalyst_AI`` dataset repo.
        """
        # Load CSV from local disk when present, otherwise from the HF Hub.
        if os.path.exists(jobs_csv):
            print(f"✅ Loading dataset locally from {jobs_csv}")
            self.jobs_df = pd.read_csv(jobs_csv)
        else:
            print("📥 Fetching dataset from Hugging Face Hub...")
            dataset_path = hf_hub_download(
                repo_id="shreyan67/Job-Catalyst_AI",
                filename="JobsFE.csv",
                repo_type="dataset",
            )
            self.jobs_df = pd.read_csv(dataset_path)
        self.jobs_df = self.jobs_df.fillna("")

        # Columns concatenated into one free-text document per job posting.
        text_cols = [
            "position", "job_role_and_duties", "requisite_skill", "benefits",
            "formatted_experience_level", "formatted_work_type", "work_type",
            "city", "state", "country",
        ]
        self.jobs_df["job_text"] = (
            self.jobs_df[text_cols].astype(str).agg(" ".join, axis=1)
        )
        # Deduplicate on the composed text so identical postings rank once.
        self.jobs_df = (
            self.jobs_df.drop_duplicates(subset=["job_text"]).reset_index(drop=True)
        )
        self.jobs_texts = self.jobs_df["job_text"].tolist()

        print("⚡ Precomputing TF-IDF vectors...")
        self.vectorizer = TfidfVectorizer(max_features=10000)
        self.job_tfidf_matrix = self.vectorizer.fit_transform(self.jobs_texts)

        # Prefer precomputed embeddings from the Hub; fall back to encoding.
        self.job_embeddings = None
        try:
            emb_path = hf_hub_download(
                repo_id="shreyan67/Job-Catalyst_AI",
                filename="job_embeddings.npy",
                repo_type="dataset",
            )
            embeddings = np.load(emb_path)
            # FIX: the published embeddings may come from an older dataset
            # snapshot. Using them with a mismatched row count would misalign
            # (or overflow) the indices produced by filter_top_jobs, so only
            # accept them when they cover exactly one row per job text.
            if len(embeddings) == len(self.jobs_texts):
                print("✅ Loaded precomputed embeddings from Hugging Face Hub")
                self.job_embeddings = embeddings
            else:
                print("⚠️ Precomputed embeddings do not match dataset size. Regenerating...")
        except Exception:
            print("⚠️ No precomputed embeddings found. Generating now...")
        if self.job_embeddings is None:
            self.job_embeddings = MODEL.encode(
                self.jobs_texts,
                batch_size=64,
                show_progress_bar=True,
                convert_to_numpy=True,
            )
            np.save("job_embeddings.npy", self.job_embeddings)
            print("✅ Saved embeddings to job_embeddings.npy")

    def filter_top_jobs(self, resume_text: str, top_k: int = 500):
        """Return the ``top_k`` TF-IDF-closest jobs and their embeddings.

        Args:
            resume_text: Raw resume text to match against the catalogue.
            top_k: Number of candidate rows to keep (capped by catalogue size).

        Returns:
            Tuple of (filtered jobs DataFrame with a fresh index,
            matching rows of the embedding matrix, aligned by position).
        """
        resume_vector = self.vectorizer.transform([resume_text])
        # linear_kernel on L2-normalized TF-IDF rows equals cosine similarity
        # but skips the redundant re-normalization.
        cosine_similarities = linear_kernel(resume_vector, self.job_tfidf_matrix).flatten()
        top_indices = cosine_similarities.argsort()[-top_k:][::-1]
        return (
            self.jobs_df.iloc[top_indices].reset_index(drop=True),
            self.job_embeddings[top_indices],
        )

    def recommend_jobs(self, resume_text: str, top_n: int = 20):
        """Rank jobs for a resume and return the ``top_n`` as plain dicts.

        TF-IDF prefilters candidates, then Sentence-BERT cosine similarity
        produces the final ordering (most similar first).

        Args:
            resume_text: Raw resume text.
            top_n: Number of recommendations to return.

        Returns:
            List of dicts with job metadata plus a float ``similarity`` score.
        """
        filtered_jobs_df, filtered_embeddings = self.filter_top_jobs(resume_text)
        resume_embedding = MODEL.encode(resume_text, convert_to_numpy=True).reshape(1, -1)
        similarities = cosine_similarity(resume_embedding, filtered_embeddings)[0]
        top_indices = similarities.argsort()[-top_n:][::-1]

        recommendations = []
        for idx in top_indices:
            job = filtered_jobs_df.iloc[idx]
            recommendations.append({
                "job_id": job.get("job_id", ""),
                "position": job.get("position", "N/A"),
                "workplace": job.get("workplace", "N/A"),
                "formatted_work_type": job.get("formatted_work_type", "N/A"),
                "remote_allowed": job.get("remote_allowed", "N/A"),
                "salary_range": f"{job.get('min_salary','')} - {job.get('max_salary','')} {job.get('currency','')} ({job.get('pay_period','')})",
                "experience_level": job.get("formatted_experience_level", "N/A"),
                "job_role_and_duties": job.get("job_role_and_duties", "N/A"),
                "skills": job.get("requisite_skill", "N/A"),
                "benefits": job.get("benefits", "N/A"),
                "location": f"{job.get('city','')}, {job.get('state','')}, {job.get('country','')}",
                "company_size": job.get("company_size", "N/A"),
                "employee_count": job.get("employee_count", "N/A"),
                "company_website": job.get("company_website", "N/A"),
                # Prefer an explicit apply link; fall back to the posting URL.
                "apply_link": job.get("apply_link", job.get("job_posting_url", "")),
                "similarity": float(similarities[idx]),
            })
        return recommendations