DreamStream-1 committed
Commit c04a9f1 · verified · 1 Parent(s): c3495c6

Update app.py

Files changed (1)
  1. app.py +213 -130
app.py CHANGED
@@ -1,4 +1,5 @@
 import os
 import google.generativeai as genai
 import spacy
 import nltk
@@ -10,137 +11,219 @@ from nltk.corpus import stopwords
 from nltk.tokenize import word_tokenize
 from nltk.stem import WordNetLemmatizer
 import fitz # PyMuPDF
-
-# Initialize NLTK resources
-nltk.download('punkt')
-nltk.download('stopwords')
-nltk.download('wordnet')
-stop_words = set(stopwords.words('english'))
-lemmatizer = WordNetLemmatizer()
-
-# Load the spaCy model for NER
-try:
-    nlp = spacy.load("en_core_web_sm")
-except:
-    import subprocess
-    subprocess.run(["python3", "-m", "spacy", "download", "en_core_web_sm"])
-    nlp = spacy.load("en_core_web_sm")
-
-# Fetch the Google API key from Hugging Face secrets
-google_api_key = os.getenv("GOOGLE_API_KEY")
-
-# Check if the key is being fetched correctly
-if google_api_key is None:
-    raise ValueError("Google API Key is missing from environment variables.")
-
-# Configure the Gemini API with the secret key
-genai.configure(api_key=google_api_key)
-
-# Extract text from PDF using PyMuPDF
-def extract_text_from_pdf(pdf_path):
-    """Extract text from a PDF file using PyMuPDF."""
-    doc = fitz.open(pdf_path) # Open the PDF file
-    text = ""
-    for page_num in range(len(doc)):
-        page = doc.load_page(page_num) # Load each page
-        text += page.get_text("text") # Extract text
-    return text
-
-# Extract text from DOCX files
-def extract_text_from_docx(docx_path):
-    """Extract text from a DOCX file."""
-    import docx
-    doc = docx.Document(docx_path)
-    text = "\n".join([para.text for para in doc.paragraphs])
-    return text
-
-# Preprocess text: Tokenization, stopword removal, lemmatization
-def preprocess_text(text):
-    """Preprocess the text by tokenizing, removing stopwords, and lemmatizing."""
-    text = text.lower()
-    text = re.sub(r'\s+', ' ', text) # Normalize spaces
-    text = re.sub(r'[^a-z0-9\s]', '', text) # Remove special characters
-    tokens = word_tokenize(text)
-    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
-    return " ".join(tokens)
-
-# Extract named entities from text using spaCy
-def extract_named_entities(text):
-    """Extract named entities from text using spaCy's NER model."""
-    doc = nlp(text)
-    entities = [ent.text for ent in doc.ents]
-    return set(entities)
-
-# Calculate match percentage using fuzzy matching
-def calculate_match_percentage(resume_text, job_desc_text):
-    """Calculate match percentage between resume and job description using fuzzy matching."""
-    resume_text = preprocess_text(resume_text)
-    job_desc_text = preprocess_text(job_desc_text)
-    match = fuzz.partial_ratio(resume_text, job_desc_text)
-    return match
-
-# Use Gemini 1.5 Flash to analyze text and extract job-related insights
-def gemini_analysis(text):
-    """Use Gemini 1.5 Flash model to analyze text and extract insights like roles and skills."""
-    response = genai.generate_text(input_text=text)
-    return response.text
-
-# Process resumes and calculate match with job description
-def process_uploaded_resumes(resume_files: list, job_desc: str):
-    """Process multiple uploaded resumes and compare them against a job description."""
-    results = []
-    for resume in resume_files:
-        if resume.name.endswith('.pdf'):
-            resume_text = extract_text_from_pdf(resume.name)
-        elif resume.name.endswith('.docx'):
-            resume_text = extract_text_from_docx(resume.name)
-        else:
-            results.append({"Resume": resume.name, "Match Percentage": "Invalid File Type"})
-            continue
-
-        # Named Entity Recognition
-        resume_entities = extract_named_entities(resume_text)
-        job_desc_entities = extract_named_entities(job_desc)
-
-        # Compare named entities between resume and job description
-        entity_match = len(resume_entities.intersection(job_desc_entities)) / len(job_desc_entities) * 100
 
-        # Use Gemini 1.5 Flash model to analyze job-related insights (optional)
-        gemini_match = gemini_analysis(resume_text)
 
-        # Calculate match percentage based on fuzzy matching
-        match_percentage = calculate_match_percentage(resume_text, job_desc)
-
-        # Combine all results
-        results.append({
-            "Resume": resume.name,
-            "Match Percentage": match_percentage,
-            "Entity Match (%)": entity_match,
-            "Gemini Role Analysis": gemini_match
-        })
-
-    return pd.DataFrame(results)
 
 # Create the Gradio interface
-def create_gradio_interface():
-    """Creates and launches a Gradio interface for the ResumeAnalyzer."""
-    resume_input = gr.Files(label="Upload Resumes (PDF, DOCX)", type="filepath") # Changed to 'filepath'
-    job_desc_input = gr.Textbox(label="Enter Job Description", lines=6, placeholder="Paste the job description here...")
-    output = gr.DataFrame(label="Resume Analysis Results")
-
-    interface = gr.Interface(
-        fn=process_uploaded_resumes,
-        inputs=[resume_input, job_desc_input],
-        outputs=[output],
-        title="Resume Match Analysis",
-        description="Upload resumes and provide a job description to see how well the resumes match the required skills, experience, and role.",
-        allow_flagging="never", # Disable flagging (can be enabled if needed)
-    )
-
-    return interface
-
-# Initialize the ResumeAnalyzer and Gradio interface
-gradio_interface = create_gradio_interface() # Create Gradio interface
-
-# Launch the Gradio interface
-gradio_interface.launch(share=True) # share=True for generating a public URL to share

 import os
+import logging
 import google.generativeai as genai
 import spacy
 import nltk

 from nltk.tokenize import word_tokenize
 from nltk.stem import WordNetLemmatizer
 import fitz # PyMuPDF
+from typing import List, Dict, Set
+import docx
+from huggingface_hub import HfApi
+from pathlib import Path
+import tempfile
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+class ResumeAnalyzer:
+    def __init__(self):
+        """Initialize the ResumeAnalyzer with required resources."""
+        self._initialize_nltk()
+        self._initialize_spacy()
+        self._setup_api_key()
 
+    def _initialize_nltk(self) -> None:
+        """Initialize NLTK resources safely."""
+        try:
+            # Download NLTK data to the current directory
+            nltk.data.path.append(os.getcwd())
+            for resource in ['punkt', 'stopwords', 'wordnet']:
+                try:
+                    nltk.data.find(f'tokenizers/{resource}')
+                except LookupError:
+                    nltk.download(resource, quiet=True)
+            self.stop_words = set(stopwords.words('english'))
+            self.lemmatizer = WordNetLemmatizer()
+        except Exception as e:
+            logger.error(f"Failed to initialize NLTK: {str(e)}")
+            raise
+
+    def _initialize_spacy(self) -> None:
+        """Initialize spaCy model safely."""
+        try:
+            self.nlp = spacy.load("en_core_web_sm")
+        except OSError:
+            logger.info("Downloading spaCy model...")
+            import subprocess
+            subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"], check=True)
+            self.nlp = spacy.load("en_core_web_sm")
+
+    def _setup_api_key(self) -> None:
+        """Set up Google API key from Hugging Face Spaces secrets."""
+        try:
+            # Get API key from HF Spaces secrets
+            self.google_api_key = os.environ.get("GOOGLE_API_KEY")
+            if not self.google_api_key:
+                raise ValueError("GOOGLE_API_KEY not found in environment variables")
+            genai.configure(api_key=self.google_api_key)
+        except Exception as e:
+            logger.error(f"Failed to setup API key: {str(e)}")
+            raise
+
+    def extract_text_from_pdf(self, file_path: str) -> str:
+        """Extract text from a PDF file."""
+        try:
+            with fitz.open(file_path) as doc:
+                text = " ".join(page.get_text("text") for page in doc)
+            return text
+        except Exception as e:
+            logger.error(f"Error extracting text from PDF: {str(e)}")
+            return ""
+
+    def extract_text_from_docx(self, file_path: str) -> str:
+        """Extract text from a DOCX file."""
+        try:
+            doc = docx.Document(file_path)
+            return "\n".join(para.text for para in doc.paragraphs)
+        except Exception as e:
+            logger.error(f"Error extracting text from DOCX: {str(e)}")
+            return ""
+
+    def preprocess_text(self, text: str) -> str:
+        """Preprocess the text."""
+        try:
+            text = text.lower()
+            text = re.sub(r'\s+', ' ', text)
+            text = re.sub(r'[^a-z0-9\s]', '', text)
+            tokens = word_tokenize(text)
+            tokens = [self.lemmatizer.lemmatize(word)
+                      for word in tokens
+                      if word not in self.stop_words]
+            return " ".join(tokens)
+        except Exception as e:
+            logger.error(f"Error in text preprocessing: {str(e)}")
+            return text
+
+    def extract_named_entities(self, text: str) -> Set[str]:
+        """Extract named entities from text."""
+        try:
+            # Limit text length to prevent memory issues
+            doc = self.nlp(text[:100000])
+            return {ent.text for ent in doc.ents}
+        except Exception as e:
+            logger.error(f"Error in named entity extraction: {str(e)}")
+            return set()
+
+    def calculate_match_percentage(self, resume_text: str, job_desc_text: str) -> float:
+        """Calculate the match percentage between resume and job description."""
+        try:
+            resume_text = self.preprocess_text(resume_text)
+            job_desc_text = self.preprocess_text(job_desc_text)
+            return fuzz.partial_ratio(resume_text, job_desc_text)
+        except Exception as e:
+            logger.error(f"Error calculating match percentage: {str(e)}")
+            return 0.0
+
+    def gemini_analysis(self, text: str) -> str:
+        """Analyze text using Gemini API."""
+        try:
+            prompt = f"""Analyze this resume text and provide a brief summary of key skills and experience:
+            {text[:1000]}..."""
+            response = genai.generate_text(prompt=prompt)
+            return response.text
+        except Exception as e:
+            logger.error(f"Error in Gemini analysis: {str(e)}")
+            return "AI analysis failed"
+
+    def process_file(self, file: gr.File, job_desc: str) -> Dict:
+        """Process a single resume file."""
+        try:
+            # Create a temporary file
+            with tempfile.NamedTemporaryFile(delete=False, suffix=Path(file.name).suffix) as temp_file:
+                temp_file.write(file.read())
+                temp_path = temp_file.name
+
+            # Extract text based on file type
+            if file.name.lower().endswith('.pdf'):
+                text = self.extract_text_from_pdf(temp_path)
+            elif file.name.lower().endswith('.docx'):
+                text = self.extract_text_from_docx(temp_path)
+            else:
+                return {"Resume": file.name, "Match Percentage": "Invalid File Type"}
+
+            # Clean up the temporary file
+            os.unlink(temp_path)
+
+            if not text.strip():
+                return {"Resume": file.name, "Match Percentage": "No text extracted"}
+
+            # Process the text
+            entities = self.extract_named_entities(text)
+            job_entities = self.extract_named_entities(job_desc)
+
+            entity_match = (
+                len(entities.intersection(job_entities)) / len(job_entities) * 100
+                if job_entities else 0
+            )
+
+            match_percentage = self.calculate_match_percentage(text, job_desc)
+            gemini_analysis = self.gemini_analysis(text)
+
+            return {
+                "Resume": file.name,
+                "Match Percentage": round(match_percentage, 2),
+                "Entity Match (%)": round(entity_match, 2),
+                "AI Analysis": gemini_analysis
+            }
+
+        except Exception as e:
+            logger.error(f"Error processing file {file.name}: {str(e)}")
+            return {"Resume": file.name, "Error": str(e)}
+
+    def process_uploaded_resumes(self, resume_files: List[gr.File], job_desc: str) -> pd.DataFrame:
+        """Process multiple resume files."""
+        if not resume_files:
+            return pd.DataFrame({"Message": ["Please upload at least one resume."]})
 
+        if not job_desc.strip():
+            return pd.DataFrame({"Message": ["Please provide a job description."]})
+
+        results = []
+        for file in resume_files:
+            result = self.process_file(file, job_desc)
+            results.append(result)
+
+        return pd.DataFrame(results)
 
 # Create the Gradio interface
+analyzer = ResumeAnalyzer()
+
+interface = gr.Interface(
+    fn=analyzer.process_uploaded_resumes,
+    inputs=[
+        gr.Files(
+            label="Upload Resumes (PDF or DOCX)",
+            file_types=[".pdf", ".docx"],
+            type="file"
+        ),
+        gr.Textbox(
+            label="Job Description",
+            placeholder="Paste the job description here...",
+            lines=6
+        )
+    ],
+    outputs=gr.DataFrame(label="Analysis Results"),
+    title="AI Resume Analyzer",
+    description="""
+    Upload resumes (PDF or DOCX) and provide a job description to see how well they match.
+    The analysis includes:
+    - Overall match percentage
+    - Key skills and experience matching
+    - AI-powered resume analysis
+    """,
+    allow_flagging="never",
+    theme=gr.themes.Soft()
+)
+
+# Launch the interface
+if __name__ == "__main__":
+    interface.launch()
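
For quick local verification, the refactored class can also be exercised outside the Gradio UI. The following is a minimal, illustrative sketch rather than part of the commit; it assumes the file above is saved as app.py, its dependencies are installed, and GOOGLE_API_KEY is set in the environment.

# Illustrative sketch only; assumes GOOGLE_API_KEY is exported and the
# spaCy model / NLTK data can be fetched on first run.
from app import ResumeAnalyzer  # the module shown in this commit

analyzer = ResumeAnalyzer()
resume_text = "Python developer with five years of NLP experience using spaCy and NLTK."
job_desc = "Hiring a Python engineer with hands-on NLP (spaCy, NLTK) experience."

# Fuzzy score (0-100), the value reported as "Match Percentage" in the UI
print(analyzer.calculate_match_percentage(resume_text, job_desc))

# Named entities that feed the "Entity Match (%)" column
print(analyzer.extract_named_entities(job_desc))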