import spacy
import streamlit as st
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import requests
import re
import pandas as pd
import fitz  # PyMuPDF
import PyPDF2
from PIL import Image
import pytesseract
from pdf2image import convert_from_bytes
import docx  # For DOCX processing
import io
import os

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Download and load the SpaCy model if not already available
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    from spacy.cli import download
    download("en_core_web_sm")  # Download the model
    nlp = spacy.load("en_core_web_sm")  # Load the model after downloading


# Function for PyMuPDF text extraction
def extract_text_with_pymupdf(pdf_file):
    """Extract text using PyMuPDF (fitz)."""
    text = ""
    pdf_document = fitz.open(stream=pdf_file.read(), filetype="pdf")
    for page in pdf_document:
        text += page.get_text()
    pdf_document.close()
    return text


# Function for PyPDF2 text extraction
def extract_text_with_pypdf2(pdf_file):
    """Extract text using PyPDF2."""
    text = ""
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    for page in pdf_reader.pages:
        text += page.extract_text() or ""
    return text


# Function for OCR extraction (for scanned PDFs)
def extract_text_with_ocr(pdf_file):
    """Extract text using OCR (for scanned PDFs)."""
    text = ""
    # Uploaded files have no filesystem path, so convert the PDF's bytes
    # to images rather than converting from a path
    images = convert_from_bytes(pdf_file.read())
    for image in images:
        text += pytesseract.image_to_string(image)
    return text


# Function for DOCX text extraction
def extract_text_from_docx(docx_file):
    """Extract text from a DOCX file."""
    doc = docx.Document(docx_file)
    return '\n'.join(para.text for para in doc.paragraphs)


# Unified PDF extraction function
def extract_text_from_pdf(pdf_file):
    """Extract text by trying multiple methods, falling back in order."""
    text = ""

    # Attempt PyMuPDF extraction
    try:
        text = extract_text_with_pymupdf(pdf_file)
        if text.strip():  # PyMuPDF returned meaningful text
            return text
    except Exception as e:
        print(f"Error with PyMuPDF: {e}")

    # Reset file pointer
    pdf_file.seek(0)

    # Attempt PyPDF2 extraction
    try:
        text = extract_text_with_pypdf2(pdf_file)
        if text.strip():  # PyPDF2 returned meaningful text
            return text
    except Exception as e:
        print(f"Error with PyPDF2: {e}")

    # Reset file pointer
    pdf_file.seek(0)

    # Attempt OCR as a last resort
    try:
        text = extract_text_with_ocr(pdf_file)
        if text.strip():  # OCR returned meaningful text
            return text
    except Exception as e:
        print(f"Error with OCR: {e}")

    return text  # Return empty text if all methods fail


# Function to clean and normalize text
def clean_and_normalize_text(text):
    """Clean and normalize the resume/job description text."""
    # Tokenization
    tokens = word_tokenize(text)

    # Lowercase and remove non-alphabetical tokens
    tokens = [word.lower() for word in tokens if word.isalpha()]

    # Remove stopwords using NLTK
    stop_words = set(stopwords.words("english"))
    filtered_tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization using SpaCy
    doc = nlp(' '.join(filtered_tokens))
    lemmatized_tokens = [token.lemma_ for token in doc]

    # Reconstruct the cleaned text and collapse extra whitespace
    cleaned_text = ' '.join(lemmatized_tokens)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    return cleaned_text


# Function for Named Entity Recognition (NER)
def extract_named_entities(text):
    """Extract named entities from text using SpaCy."""
    doc = nlp(text)
    # Extract (text, label) pairs for each recognized entity
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities
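
# A minimal, hypothetical smoke test for the extraction pipeline above. It is
# not part of the original app and is never invoked by Streamlit; call it
# manually when debugging outside the UI. The path argument is illustrative.
def _debug_extract(pdf_path: str) -> None:
    """Run the PDF fallback chain, clean the text, and print a few entities."""
    with open(pdf_path, "rb") as f:
        raw_text = extract_text_from_pdf(f)
    cleaned = clean_and_normalize_text(raw_text)
    print(cleaned[:300])                         # Preview of the normalized text
    print(extract_named_entities(cleaned)[:10])  # First few (text, label) pairs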

# Function to analyze the resume and job description using the Gemini 1.5 Flash model
def analyze_documents(resume_text, job_description):
    """Analyze resume text against the job description using Gemini 1.5 Flash."""
    custom_prompt = f"""
    Please analyze the following resume in the context of the job description provided.
    For the match percentage, please consider:
    - The relevance of the hard skills mentioned.
    - The match of experiences and achievements listed in the resume.
    - Only return a 100% match if all critical skills, experiences, and keywords align well and meaningfully with the job description.

    Job Description:
    {job_description}

    Resume:
    {resume_text}
    """

    API_KEY = os.getenv("GEMINI_API_KEY")  # Set this environment variable securely
    if not API_KEY:
        return {
            "Match Percentage": "API Key Missing",
            "Recommendations": "Please set the GEMINI_API_KEY environment variable."
        }

    url = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash-latest:generateContent?key={API_KEY}"
    headers = {'Content-Type': 'application/json'}
    data = {
        "contents": [
            {"role": "user", "parts": [{"text": custom_prompt}]}
        ]
    }

    response = requests.post(url, headers=headers, json=data)
    if response.status_code == 200:
        try:
            result = response.json()
            # Gemini nests generated text under candidates -> content -> parts
            analysis = result["candidates"][0]["content"]["parts"][0]["text"].strip()
            return {
                "Match Percentage": analysis,  # Raw model output; parse further as needed
                "Recommendations": "Placeholder for actual recommendations."
            }
        except (ValueError, KeyError, IndexError):
            return {"Match Percentage": "Error", "Recommendations": "Failed to parse response."}
    else:
        return {
            "Match Percentage": "Error",
            "Recommendations": f"API request failed with status code {response.status_code}."
        }


# Streamlit app configuration
st.set_page_config(page_title="ATS Resume Evaluation System", layout="wide")

# Header Section
st.markdown(
    """ """,
    unsafe_allow_html=True
)
st.markdown('