import os
import logging

import gradio as gr
import PyPDF2
import docx2txt

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Configure logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

# ----------------------------------------------------------------------------
# 1) Utility Functions: Parsing & Preprocessing
# ----------------------------------------------------------------------------

def extract_text_from_pdf(file_obj):
    """Extract all text from a PDF file object."""
    text_content = []
    try:
        logging.info("Loading PDF file.")
        pdf_reader = PyPDF2.PdfReader(file_obj)
        for page in pdf_reader.pages:
            page_text = page.extract_text()
            if page_text:
                text_content.append(page_text)
        extracted_text = "\n".join(text_content)
        logging.info(f"Extracted PDF content: {extracted_text[:500]}...")
        
        print(extracted_text)  # Print the extracted text
        
        return extracted_text
    except Exception as e:
        logging.error(f"Error reading PDF: {e}")
        return f"Error reading PDF: {e}"

def extract_text_from_docx(file_path):
    """Extract all text from a DOCX file on disk."""
    try:
        logging.info("Loading DOCX file.")
        extracted_text = docx2txt.process(file_path)
        logging.info(f"Extracted DOCX content: {extracted_text[:500]}...")
        
        print(extracted_text) # Print the extracted text
        
        return extracted_text
    except Exception as e:
        logging.error(f"Error reading DOCX: {e}")
        return f"Error reading DOCX: {e}"

def extract_text_from_txt(file_obj):
    """Extract all text from a TXT file object."""
    try:
        logging.info("Loading TXT file.")
        extracted_text = file_obj.read().decode("utf-8", errors="ignore")
        logging.info(f"Extracted TXT content: {extracted_text[:500]}...")
        
        print(extracted_text) # Print the extracted text
        
        return extracted_text
    except Exception as e:
        logging.error(f"Error reading TXT: {e}")
        return f"Error reading TXT: {e}"

def preprocess_text(text):
    """
    Lowercase, tokenize, remove stopwords and non-alphabetic tokens,
    and then rejoin into a clean string.
    """
    logging.info("Preprocessing text.")
    text = str(text).lower()
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [t for t in tokens if t.isalpha() and t not in stop_words]
    processed_text = " ".join(filtered_tokens)
    logging.info(f"Preprocessed text: {processed_text[:500]}...")
    return processed_text
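
# Example (illustrative): preprocess_text("The Quick Brown Fox ran in 2024!")
# returns "quick brown fox ran": "the" and "in" are stopwords, while "2024"
# and "!" fail the isalpha() check.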

# ----------------------------------------------------------------------------
# 2) Core Ranking Logic with TF-IDF & Cosine Similarity
# ----------------------------------------------------------------------------
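#
# Cosine similarity between the job-description vector a and a resume vector b:
#     sim(a, b) = (a . b) / (||a|| * ||b||)
# With non-negative TF-IDF weights the score lies in [0, 1]: 1.0 means identical
# weighted term distributions, 0.0 means no terms in common.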

def rank_resumes_with_tfidf(job_description: str, resumes: dict):
    """Rank resumes by cosine similarity between their TF-IDF vectors and the job description's."""
    logging.info("Ranking resumes using TF-IDF.")
    if not resumes:
        return []
    preprocessed_jd = preprocess_text(job_description)
    preprocessed_resumes = {fname: preprocess_text(txt) for fname, txt in resumes.items()}
    corpus = [preprocessed_jd] + list(preprocessed_resumes.values())
    filenames = list(preprocessed_resumes.keys())
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(corpus)
    jd_vector = tfidf_matrix[0:1]
    resume_vectors = tfidf_matrix[1:]
    similarities = cosine_similarity(jd_vector, resume_vectors).flatten()
    results = list(zip(filenames, similarities))
    results_sorted = sorted(results, key=lambda x: x[1], reverse=True)
    logging.info(f"Ranking completed: {results_sorted}")
    return results_sorted
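
# Usage sketch with hypothetical in-memory data (no file parsing involved):
#
#     jd = "Python developer with NLP and machine learning experience"
#     resumes = {
#         "alice.pdf": "Senior Python developer; NLP, machine learning projects",
#         "bob.pdf": "Java backend engineer, Spring and SQL",
#     }
#     for name, score in rank_resumes_with_tfidf(jd, resumes):
#         print(f"{name}: {score:.3f}")   # alice.pdf should rank first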

# ----------------------------------------------------------------------------
# 3) Gradio Callback Function
# ----------------------------------------------------------------------------

def analyze_cvs(job_description, cv_files):
    """Extract text from each uploaded CV, rank against the job description, and return table rows."""
    logging.info("Starting CV analysis.")
    resumes_data = {}

    if not cv_files:
        return []

    for uploaded_file in cv_files:
        # With type="filepath" Gradio passes plain path strings; older versions
        # pass tempfile wrappers with a .name attribute, so accept both.
        filepath = uploaded_file if isinstance(uploaded_file, str) else uploaded_file.name
        filename = os.path.basename(filepath)
        file_ext = os.path.splitext(filename)[1].lower()

        try:
            logging.info(f"Processing file: {filename}")
            if file_ext == ".pdf":
                with open(filepath, "rb") as f:  # Open the temporary file created by Gradio
                    file_content = extract_text_from_pdf(f)
            elif file_ext == ".txt":
                with open(filepath, "rb") as f:
                    file_content = extract_text_from_txt(f)
            elif file_ext == ".docx":
                file_content = extract_text_from_docx(filepath)  # docx2txt accepts a filesystem path
            else:
                file_content = "Unsupported file type."
        except Exception as e:
            logging.error(f"Error processing file: {e}")
            file_content = f"Error processing file: {e}"

        logging.info(f"Extracted CV Content ({filename}): {file_content[:500]}...")
        resumes_data[filename] = file_content

    ranked_results = rank_resumes_with_tfidf(job_description, resumes_data)
    display_data = [[filename, round(float(score), 3)] for filename, score in ranked_results]
    logging.info("Analysis completed successfully.")
    return display_data
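
# The rows returned above feed gr.Dataframe directly, e.g. (hypothetical scores):
#     [["alice.pdf", 0.412], ["bob.pdf", 0.087]]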

# ----------------------------------------------------------------------------
# 4) Gradio Interface
# ----------------------------------------------------------------------------

def create_gradio_interface():
    """Build the Gradio UI: a job-description textbox, a multi-file upload, and a results table."""
    job_description_input = gr.Textbox(label="Job Description", placeholder="Describe the role here...", lines=4)
    cv_input = gr.File(label="Upload resumes (PDF/DOCX/TXT)", file_count="multiple", type="filepath")
    results_output = gr.Dataframe(headers=["Candidate CV", "Similarity Score"], label="Ranked Candidates")
    demo = gr.Interface(
        fn=analyze_cvs,
        inputs=[job_description_input, cv_input],
        outputs=[results_output],
        title="Resume Ranking with TF-IDF",
    )
    return demo

# ----------------------------------------------------------------------------
# 5) Main Script
# ----------------------------------------------------------------------------

if __name__ == "__main__":
    # word_tokenize needs the 'punkt' data in older NLTK releases and
    # 'punkt_tab' in newer ones; download both so either version works.
    nltk.download('punkt', quiet=True)
    nltk.download('punkt_tab', quiet=True)
    nltk.download('stopwords', quiet=True)
    app = create_gradio_interface()
    app.launch(server_name="0.0.0.0", server_port=7860, debug=True)
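
# To try it locally (assuming this script is saved as app.py):
#     pip install gradio PyPDF2 docx2txt nltk scikit-learn
#     python app.py
# then open http://localhost:7860 in a browser.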