"""Flask service that predicts a job title from an uploaded PDF resume.

Pipeline: extract text from the PDF (PyMuPDF), pull entities out of the text
with a custom spaCy NER model, then feed those entities to a T5 model whose
weights are loaded from ``best.pth``.
"""

import os

import fitz  # PyMuPDF for PDF text extraction
import spacy
import torch
from flask import Flask, render_template, request, jsonify
from flask_cors import CORS
from transformers import T5Tokenizer, T5ForConditionalGeneration

app = Flask(__name__)
CORS(app)

# Entity labels from the NER model that are forwarded to the T5 prompt.
_RELEVANT_LABELS = frozenset({"SKILL", "ROLE", "LOCATION", "AREA", "INDUSTRY"})

# ===== Load Custom NER Model =====
try:
    nlp = spacy.load("custom_ner_model")  # Load your custom-trained NER model
    print("Custom NER model loaded successfully.")
except Exception as e:
    print(f"Error loading custom NER model: {e}")
    # Exit with a non-zero status so supervisors/containers see the failure;
    # a bare exit() reports success and depends on the `site` module.
    raise SystemExit(1)

# ===== Load T5 Model for Job Title Prediction =====
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")

# Load model weights
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.load_state_dict(torch.load("best.pth", map_location=device))
model.eval()
model.to(device)
print("T5 model for job title prediction loaded successfully.")


# ===== Helper Functions =====

def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in *pdf_file*.

    Args:
        pdf_file: A binary file-like object; its full content is read and
            handed to PyMuPDF as an in-memory PDF stream.

    Returns:
        The text of all pages joined in page order.
    """
    # The context manager closes the document even if a page fails to parse.
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        return "".join(page.get_text() for page in doc)


def extract_entities(text):
    """Run the custom NER model on *text* and group the relevant entities.

    Args:
        text: Raw resume text.

    Returns:
        A dict mapping each label found (among SKILL, ROLE, LOCATION, AREA,
        INDUSTRY) to a comma-separated string of its unique entity texts in
        first-seen order. Labels with no entities are absent from the dict.
    """
    # Turn literal two-character "\n" sequences into real newlines.
    text = text.replace("\\n", "\n")
    doc = nlp(text)  # Process text with custom NER

    # label -> {entity text: None}; a dict de-duplicates in O(1) per entity
    # while keeping first-seen order.
    grouped = {}
    for ent in doc.ents:
        if ent.label_ in _RELEVANT_LABELS:
            grouped.setdefault(ent.label_, {})[ent.text] = None

    # Format results as comma-separated strings
    return {label: ", ".join(texts) for label, texts in grouped.items()}


def predict_job_title(skills, area, roles, location, industry):
    """Generate a job title from the extracted resume fields using T5.

    Args:
        skills: Comma-separated skills string.
        area: Comma-separated areas string.
        roles: Comma-separated roles string.
        location: Comma-separated locations string.
        industry: Comma-separated industries string.

    Returns:
        The decoded model output with special tokens removed.
    """
    input_text = f"Skills: {skills}; \nRole: {roles}; \nLocation: {location}; \nArea: {area}; \nIndustry: {industry}"
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        padding=True,
        truncation=True,
    ).to(device)

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = model.generate(
            inputs["input_ids"],
            max_length=50,
            num_beams=4,
            early_stopping=True,
        )

    predicted_job_title = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return predicted_job_title


# ===== Flask Routes =====

@app.route('/')
def home():
    """Serve the default home page."""
    return render_template('index.html')


@app.route('/predict', methods=['POST'])
def predict():
    """Handle a resume upload and return the predicted title as JSON.

    Expects a multipart form with a ``resume`` file field containing a PDF.

    Returns:
        200 with the prediction and the extracted entity strings; 400 when
        the file is missing, unnamed, or not a ``.pdf``; 500 with the error
        message when any processing step raises.
    """
    if 'resume' not in request.files:
        return jsonify({'error': 'No file uploaded'}), 400

    file = request.files['resume']
    # `not` also covers a filename of None, not just the empty string.
    if not file.filename:
        return jsonify({'error': 'No file selected'}), 400

    # Compare case-insensitively so "resume.PDF" is accepted too.
    if not file.filename.lower().endswith('.pdf'):
        return jsonify({'error': 'Please upload a PDF file'}), 400

    try:
        # Step 1: Extract text from PDF
        resume_text = extract_text_from_pdf(file)

        # Step 2: Extract entities using Custom NER
        extracted_data = extract_entities(resume_text)

        # Step 3: Prepare input for T5 prediction
        skills = extracted_data.get("SKILL", "")
        area = extracted_data.get("AREA", "")
        roles = extracted_data.get("ROLE", "")
        location = extracted_data.get("LOCATION", "")
        industry = extracted_data.get("INDUSTRY", "")

        # Step 4: Predict job title
        predicted_title = predict_job_title(skills, area, roles, location, industry)

        # Step 5: Return response
        response = {
            'success': True,
            # Keep only the text before the first ';' in the model output.
            'predicted_title': predicted_title.split(";")[0],
            'extracted_skills': skills,
            'roles': roles,
            'locations': location,
            'area': area,
            'industry': industry,
        }
        return jsonify(response)

    except Exception as e:
        # Top-level request boundary: record the traceback server-side
        # instead of silently discarding it, then report the failure.
        app.logger.exception("Resume prediction failed")
        return jsonify({'error': str(e)}), 500


if __name__ == '__main__':
    from waitress import serve

    print("Starting Flask app...")
    serve(app, host="0.0.0.0", port=7860)