# Job_Catalyst-AI / model.py
import os
import pandas as pd
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from huggingface_hub import hf_hub_download
# ---------------- Load Sentence-BERT Model ----------------
def load_sentence_model():
    """Load Sentence-BERT from a local copy if present, otherwise from the Hub, then quantize for CPU."""
    try:
        model = SentenceTransformer("./paraphrase-MiniLM-L6-v2", device="cpu")  # local copy
    except Exception:
        model = SentenceTransformer("sentence-transformers/paraphrase-MiniLM-L6-v2", device="cpu")  # Hub fallback
    # Dynamic int8 quantization of the Linear layers keeps CPU inference lightweight.
    return torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
MODEL = load_sentence_model()
# ---------------- Job Recommendation System ----------------
class JobRecommendationSystem:
    def __init__(self, jobs_csv: str = "JobsFE.csv"):
        # Load the jobs CSV from disk if present, otherwise pull it from the HF Hub.
        if os.path.exists(jobs_csv):
            print(f"✅ Loading dataset locally from {jobs_csv}")
            self.jobs_df = pd.read_csv(jobs_csv)
        else:
            print("📥 Fetching dataset from Hugging Face Hub...")
            dataset_path = hf_hub_download(
                repo_id="shreyan67/Job-Catalyst_AI",  # HF dataset repo
                filename="JobsFE.csv",
                repo_type="dataset",
            )
            self.jobs_df = pd.read_csv(dataset_path)

        self.jobs_df = self.jobs_df.fillna("")

        # Concatenate the descriptive columns into a single text field per job posting.
        text_cols = [
            "position", "job_role_and_duties", "requisite_skill", "benefits",
            "formatted_experience_level", "formatted_work_type", "work_type",
            "city", "state", "country",
        ]
        self.jobs_df["job_text"] = self.jobs_df[text_cols].astype(str).agg(" ".join, axis=1)
        self.jobs_df = self.jobs_df.drop_duplicates(subset=["job_text"]).reset_index(drop=True)
        self.jobs_texts = self.jobs_df["job_text"].tolist()

        print("⚡ Precomputing TF-IDF vectors...")
        self.vectorizer = TfidfVectorizer(max_features=10000)
        self.job_tfidf_matrix = self.vectorizer.fit_transform(self.jobs_texts)

        # Load precomputed Sentence-BERT embeddings from the Hub if available,
        # otherwise encode all job texts now and cache the result locally.
        try:
            emb_path = hf_hub_download(
                repo_id="shreyan67/Job-Catalyst_AI",
                filename="job_embeddings.npy",
                repo_type="dataset",
            )
            print("✅ Loaded precomputed embeddings from Hugging Face Hub")
            self.job_embeddings = np.load(emb_path)
        except Exception:
            print("⚠️ No precomputed embeddings found. Generating now...")
            self.job_embeddings = MODEL.encode(
                self.jobs_texts,
                batch_size=64,
                show_progress_bar=True,
                convert_to_numpy=True,
            )
            np.save("job_embeddings.npy", self.job_embeddings)
            print("✅ Saved embeddings to job_embeddings.npy")
    def filter_top_jobs(self, resume_text: str, top_k: int = 500):
        # Fast TF-IDF prefilter: keep only the top_k most similar postings
        # before the heavier Sentence-BERT comparison in recommend_jobs().
        resume_vector = self.vectorizer.transform([resume_text])
        cosine_similarities = linear_kernel(resume_vector, self.job_tfidf_matrix).flatten()
        top_indices = cosine_similarities.argsort()[-top_k:][::-1]
        return (
            self.jobs_df.iloc[top_indices].reset_index(drop=True),
            self.job_embeddings[top_indices],
        )
    def recommend_jobs(self, resume_text: str, top_n: int = 20):
        filtered_jobs_df, filtered_embeddings = self.filter_top_jobs(resume_text)
        resume_embedding = MODEL.encode(resume_text, convert_to_numpy=True).reshape(1, -1)
        similarities = cosine_similarity(resume_embedding, filtered_embeddings)[0]
        top_indices = similarities.argsort()[-top_n:][::-1]

        recommendations = []
        for idx in top_indices:
            job = filtered_jobs_df.iloc[idx]
            recommendations.append({
                "job_id": job.get("job_id", ""),
                "position": job.get("position", "N/A"),
                "workplace": job.get("workplace", "N/A"),
                "formatted_work_type": job.get("formatted_work_type", "N/A"),
                "remote_allowed": job.get("remote_allowed", "N/A"),
                "salary_range": f"{job.get('min_salary', '')} - {job.get('max_salary', '')} {job.get('currency', '')} ({job.get('pay_period', '')})",
                "experience_level": job.get("formatted_experience_level", "N/A"),
                "job_role_and_duties": job.get("job_role_and_duties", "N/A"),
                "skills": job.get("requisite_skill", "N/A"),
                "benefits": job.get("benefits", "N/A"),
                "location": f"{job.get('city', '')}, {job.get('state', '')}, {job.get('country', '')}",
                "company_size": job.get("company_size", "N/A"),
                "employee_count": job.get("employee_count", "N/A"),
                "company_website": job.get("company_website", "N/A"),
                "apply_link": job.get("apply_link", job.get("job_posting_url", "")),
                "similarity": float(similarities[idx]),
            })
        return recommendations
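# ---------------- Example Usage (illustrative) ----------------
# A minimal sketch of how this module might be exercised locally, assuming the
# dataset and embeddings resolve as in __init__ above. The sample resume text
# and top_n value below are placeholders for illustration, not part of the
# original pipeline.
if __name__ == "__main__":
    recommender = JobRecommendationSystem()
    sample_resume = (
        "Machine learning engineer with 3 years of experience in Python, "
        "PyTorch, NLP, and deploying recommendation systems in production."
    )
    for rec in recommender.recommend_jobs(sample_resume, top_n=5):
        print(f"{rec['position']} | {rec['location']} | score={rec['similarity']:.3f}")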