import os
import logging

import gradio as gr
import PyPDF2
import docx2txt

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Configure logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

# ----------------------------------------------------------------------------
# 1) Utility Functions: Parsing & Preprocessing
# ----------------------------------------------------------------------------

def extract_text_from_pdf(file_obj):
    """Extract all text from a PDF file object."""
    text_content = []
    try:
        logging.info("Loading PDF file.")
        pdf_reader = PyPDF2.PdfReader(file_obj)
        for page in pdf_reader.pages:
            page_text = page.extract_text()
            if page_text:
                text_content.append(page_text)
        extracted_text = "\n".join(text_content)
        logging.info(f"Extracted PDF content: {extracted_text[:500]}...")
        
        print(extracted_text)  # Print the extracted text
        
        return extracted_text
    except Exception as e:
        logging.error(f"Error reading PDF: {e}")
        return f"Error reading PDF: {e}"

def extract_text_from_docx(file_path):
    """Extract all text from a DOCX file on disk."""
    try:
        logging.info("Loading DOCX file.")
        extracted_text = docx2txt.process(file_path)
        logging.info(f"Extracted DOCX content: {extracted_text[:500]}...")
        
        print(extracted_text) # Print the extracted text
        
        return extracted_text
    except Exception as e:
        logging.error(f"Error reading DOCX: {e}")
        return f"Error reading DOCX: {e}"

def extract_text_from_txt(file_obj):
    """Extract all text from a TXT file object."""
    try:
        logging.info("Loading TXT file.")
        extracted_text = file_obj.read().decode("utf-8", errors="ignore")
        logging.info(f"Extracted TXT content: {extracted_text[:500]}...")
        
        print(extracted_text) # Print the extracted text
        
        return extracted_text
    except Exception as e:
        logging.error(f"Error reading TXT: {e}")
        return f"Error reading TXT: {e}"

def preprocess_text(text):
    """
    Lowercase, tokenize, remove stopwords and non-alphabetic tokens,
    and then rejoin into a clean string.
    """
    logging.info("Preprocessing text.")
    text = str(text).lower()
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [t for t in tokens if t.isalpha() and t not in stop_words]
    processed_text = " ".join(filtered_tokens)
    logging.info(f"Preprocessed text: {processed_text[:500]}...")
    return processed_text
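
# Example (illustrative): preprocess_text("The Quick Brown Fox ran in 2024!")
# returns "quick brown fox ran": "the" and "in" are stopwords, while "2024"
# and "!" fail the isalpha() check.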

# ----------------------------------------------------------------------------
# 2) Core Ranking Logic with TF-IDF & Cosine Similarity
# ----------------------------------------------------------------------------
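#
# Cosine similarity between the job-description vector a and a resume vector b:
#     sim(a, b) = (a . b) / (||a|| * ||b||)
# With non-negative TF-IDF weights the score lies in [0, 1]: 1.0 means identical
# weighted term distributions, 0.0 means no terms in common.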

def rank_resumes_with_tfidf(job_description: str, resumes: dict):
    """Rank resumes by cosine similarity between their TF-IDF vectors and the job description's."""
    logging.info("Ranking resumes using TF-IDF.")
    if not resumes:
        return []
    preprocessed_jd = preprocess_text(job_description)
    preprocessed_resumes = {fname: preprocess_text(txt) for fname, txt in resumes.items()}
    corpus = [preprocessed_jd] + list(preprocessed_resumes.values())
    filenames = list(preprocessed_resumes.keys())
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(corpus)
    jd_vector = tfidf_matrix[0:1]
    resume_vectors = tfidf_matrix[1:]
    similarities = cosine_similarity(jd_vector, resume_vectors).flatten()
    results = list(zip(filenames, similarities))
    results_sorted = sorted(results, key=lambda x: x[1], reverse=True)
    logging.info(f"Ranking completed: {results_sorted}")
    return results_sorted
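
# Usage sketch with hypothetical in-memory data (no file parsing involved):
#
#     jd = "Python developer with NLP and machine learning experience"
#     resumes = {
#         "alice.pdf": "Senior Python developer; NLP, machine learning projects",
#         "bob.pdf": "Java backend engineer, Spring and SQL",
#     }
#     for name, score in rank_resumes_with_tfidf(jd, resumes):
#         print(f"{name}: {score:.3f}")   # alice.pdf should rank first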

# ----------------------------------------------------------------------------
# 3) Gradio Callback Function
# ----------------------------------------------------------------------------

def analyze_cvs(job_description, cv_files):
    """Extract text from each uploaded CV, rank against the job description, and return table rows."""
    logging.info("Starting CV analysis.")
    resumes_data = {}

    if not cv_files:
        return []

    for uploaded_file in cv_files:
        # With type="filepath" Gradio passes plain path strings; older versions
        # pass tempfile wrappers with a .name attribute, so accept both.
        filepath = uploaded_file if isinstance(uploaded_file, str) else uploaded_file.name
        filename = os.path.basename(filepath)
        file_ext = os.path.splitext(filename)[1].lower()

        try:
            logging.info(f"Processing file: {filename}")
            if file_ext == ".pdf":
                with open(filepath, "rb") as f:  # Open the temporary file created by Gradio
                    file_content = extract_text_from_pdf(f)
            elif file_ext == ".txt":
                with open(filepath, "rb") as f:
                    file_content = extract_text_from_txt(f)
            elif file_ext == ".docx":
                file_content = extract_text_from_docx(filepath)  # docx2txt accepts a filesystem path
            else:
                file_content = "Unsupported file type."
        except Exception as e:
            logging.error(f"Error processing file: {e}")
            file_content = f"Error processing file: {e}"

        logging.info(f"Extracted CV Content ({filename}): {file_content[:500]}...")
        resumes_data[filename] = file_content

    ranked_results = rank_resumes_with_tfidf(job_description, resumes_data)
    display_data = [[filename, round(float(score), 3)] for filename, score in ranked_results]
    logging.info("Analysis completed successfully.")
    return display_data
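
# The rows returned above feed gr.Dataframe directly, e.g. (hypothetical scores):
#     [["alice.pdf", 0.412], ["bob.pdf", 0.087]]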

# ----------------------------------------------------------------------------
# 4) Gradio Interface
# ----------------------------------------------------------------------------

def create_gradio_interface():
    """Build the Gradio UI: a job-description textbox, a multi-file upload, and a results table."""
    job_description_input = gr.Textbox(label="Job Description", placeholder="Describe the role here...", lines=4)
    cv_input = gr.File(label="Upload resumes (PDF/DOCX/TXT)", file_count="multiple", type="filepath")
    results_output = gr.Dataframe(headers=["Candidate CV", "Similarity Score"], label="Ranked Candidates")
    demo = gr.Interface(
        fn=analyze_cvs,
        inputs=[job_description_input, cv_input],
        outputs=[results_output],
        title="Resume Ranking with TF-IDF",
    )
    return demo

# ----------------------------------------------------------------------------
# 5) Main Script
# ----------------------------------------------------------------------------

if __name__ == "__main__":
    # word_tokenize needs the 'punkt' data in older NLTK releases and
    # 'punkt_tab' in newer ones; download both so either version works.
    nltk.download('punkt', quiet=True)
    nltk.download('punkt_tab', quiet=True)
    nltk.download('stopwords', quiet=True)
    app = create_gradio_interface()
    app.launch(server_name="0.0.0.0", server_port=7860, debug=True)
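
# To try it locally (assuming this script is saved as app.py):
#     pip install gradio PyPDF2 docx2txt nltk scikit-learn
#     python app.py
# then open http://localhost:7860 in a browser.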