Spaces: Build error
Update app.py

app.py CHANGED
@@ -1,4 +1,5 @@
 import os
+import logging
 import google.generativeai as genai
 import spacy
 import nltk
@@ -10,137 +11,219 @@ from nltk.corpus import stopwords
 from nltk.tokenize import word_tokenize
 from nltk.stem import WordNetLemmatizer
 import fitz  # PyMuPDF
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-# Check if the key is being fetched correctly
-if google_api_key is None:
-    raise ValueError("Google API Key is missing from environment variables.")
-
-# Configure the Gemini API with the secret key
-genai.configure(api_key=google_api_key)
-
-# Extract text from PDF using PyMuPDF
-def extract_text_from_pdf(pdf_path):
-    """Extract text from a PDF file using PyMuPDF."""
-    doc = fitz.open(pdf_path)  # Open the PDF file
-    text = ""
-    for page_num in range(len(doc)):
-        page = doc.load_page(page_num)  # Load each page
-        text += page.get_text("text")  # Extract text
-    return text
-
-# Extract text from DOCX files
-def extract_text_from_docx(docx_path):
-    """Extract text from a DOCX file."""
-    import docx
-    doc = docx.Document(docx_path)
-    text = "\n".join([para.text for para in doc.paragraphs])
-    return text
-
-# Preprocess text: Tokenization, stopword removal, lemmatization
-def preprocess_text(text):
-    """Preprocess the text by tokenizing, removing stopwords, and lemmatizing."""
-    text = text.lower()
-    text = re.sub(r'\s+', ' ', text)  # Normalize spaces
-    text = re.sub(r'[^a-z0-9\s]', '', text)  # Remove special characters
-    tokens = word_tokenize(text)
-    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
-    return " ".join(tokens)
-
-# Extract named entities from text using spaCy
-def extract_named_entities(text):
-    """Extract named entities from text using spaCy's NER model."""
-    doc = nlp(text)
-    entities = [ent.text for ent in doc.ents]
-    return set(entities)
-
-# Calculate match percentage using fuzzy matching
-def calculate_match_percentage(resume_text, job_desc_text):
-    """Calculate match percentage between resume and job description using fuzzy matching."""
-    resume_text = preprocess_text(resume_text)
-    job_desc_text = preprocess_text(job_desc_text)
-    match = fuzz.partial_ratio(resume_text, job_desc_text)
-    return match
-
-# Use Gemini 1.5 Flash to analyze text and extract job-related insights
-def gemini_analysis(text):
-    """Use Gemini 1.5 Flash model to analyze text and extract insights like roles and skills."""
-    response = genai.generate_text(input_text=text)
-    return response.text
-
-# Process resumes and calculate match with job description
-def process_uploaded_resumes(resume_files: list, job_desc: str):
-    """Process multiple uploaded resumes and compare them against a job description."""
-    results = []
-    for resume in resume_files:
-        if resume.name.endswith('.pdf'):
-            resume_text = extract_text_from_pdf(resume.name)
-        elif resume.name.endswith('.docx'):
-            resume_text = extract_text_from_docx(resume.name)
-        else:
-            results.append({"Resume": resume.name, "Match Percentage": "Invalid File Type"})
-            continue
-
-        # Named Entity Recognition
-        resume_entities = extract_named_entities(resume_text)
-        job_desc_entities = extract_named_entities(job_desc)
-
-        # Compare named entities between resume and job description
-        entity_match = len(resume_entities.intersection(job_desc_entities)) / len(job_desc_entities) * 100
-
-
-
-
-
-
-
-
-
-
-
-
-
-        })
-
-    return pd.DataFrame(results)
+from typing import List, Dict, Set
+import docx
+from huggingface_hub import HfApi
+from pathlib import Path
+import tempfile
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+class ResumeAnalyzer:
+    def __init__(self):
+        """Initialize the ResumeAnalyzer with required resources."""
+        self._initialize_nltk()
+        self._initialize_spacy()
+        self._setup_api_key()
+
+    def _initialize_nltk(self) -> None:
+        """Initialize NLTK resources safely."""
+        try:
+            # Download NLTK data to the current directory
+            nltk.data.path.append(os.getcwd())
+            for resource in ['punkt', 'stopwords', 'wordnet']:
+                try:
+                    nltk.data.find(f'tokenizers/{resource}')
+                except LookupError:
+                    nltk.download(resource, quiet=True)
+            self.stop_words = set(stopwords.words('english'))
+            self.lemmatizer = WordNetLemmatizer()
+        except Exception as e:
+            logger.error(f"Failed to initialize NLTK: {str(e)}")
+            raise
+
+    def _initialize_spacy(self) -> None:
+        """Initialize spaCy model safely."""
+        try:
+            self.nlp = spacy.load("en_core_web_sm")
+        except OSError:
+            logger.info("Downloading spaCy model...")
+            import subprocess
+            subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"], check=True)
+            self.nlp = spacy.load("en_core_web_sm")
+
+    def _setup_api_key(self) -> None:
+        """Set up Google API key from Hugging Face Spaces secrets."""
+        try:
+            # Get API key from HF Spaces secrets
+            self.google_api_key = os.environ.get("GOOGLE_API_KEY")
+            if not self.google_api_key:
+                raise ValueError("GOOGLE_API_KEY not found in environment variables")
+            genai.configure(api_key=self.google_api_key)
+        except Exception as e:
+            logger.error(f"Failed to setup API key: {str(e)}")
+            raise
+
+    def extract_text_from_pdf(self, file_path: str) -> str:
+        """Extract text from a PDF file."""
+        try:
+            with fitz.open(file_path) as doc:
+                text = " ".join(page.get_text("text") for page in doc)
+            return text
+        except Exception as e:
+            logger.error(f"Error extracting text from PDF: {str(e)}")
+            return ""
+
+    def extract_text_from_docx(self, file_path: str) -> str:
+        """Extract text from a DOCX file."""
+        try:
+            doc = docx.Document(file_path)
+            return "\n".join(para.text for para in doc.paragraphs)
+        except Exception as e:
+            logger.error(f"Error extracting text from DOCX: {str(e)}")
+            return ""
+
+    def preprocess_text(self, text: str) -> str:
+        """Preprocess the text."""
+        try:
+            text = text.lower()
+            text = re.sub(r'\s+', ' ', text)
+            text = re.sub(r'[^a-z0-9\s]', '', text)
+            tokens = word_tokenize(text)
+            tokens = [self.lemmatizer.lemmatize(word)
+                      for word in tokens
+                      if word not in self.stop_words]
+            return " ".join(tokens)
+        except Exception as e:
+            logger.error(f"Error in text preprocessing: {str(e)}")
+            return text
+
+    def extract_named_entities(self, text: str) -> Set[str]:
+        """Extract named entities from text."""
+        try:
+            # Limit text length to prevent memory issues
+            doc = self.nlp(text[:100000])
+            return {ent.text for ent in doc.ents}
+        except Exception as e:
+            logger.error(f"Error in named entity extraction: {str(e)}")
+            return set()
+
+    def calculate_match_percentage(self, resume_text: str, job_desc_text: str) -> float:
+        """Calculate the match percentage between resume and job description."""
+        try:
+            resume_text = self.preprocess_text(resume_text)
+            job_desc_text = self.preprocess_text(job_desc_text)
+            return fuzz.partial_ratio(resume_text, job_desc_text)
+        except Exception as e:
+            logger.error(f"Error calculating match percentage: {str(e)}")
+            return 0.0
+
+    def gemini_analysis(self, text: str) -> str:
+        """Analyze text using Gemini API."""
+        try:
+            prompt = f"""Analyze this resume text and provide a brief summary of key skills and experience:
+            {text[:1000]}..."""
+            response = genai.generate_text(prompt=prompt)
+            return response.text
+        except Exception as e:
+            logger.error(f"Error in Gemini analysis: {str(e)}")
+            return "AI analysis failed"
+
+    def process_file(self, file: gr.File, job_desc: str) -> Dict:
+        """Process a single resume file."""
+        try:
+            # Create a temporary file
+            with tempfile.NamedTemporaryFile(delete=False, suffix=Path(file.name).suffix) as temp_file:
+                temp_file.write(file.read())
+                temp_path = temp_file.name
+
+            # Extract text based on file type
+            if file.name.lower().endswith('.pdf'):
+                text = self.extract_text_from_pdf(temp_path)
+            elif file.name.lower().endswith('.docx'):
+                text = self.extract_text_from_docx(temp_path)
+            else:
+                return {"Resume": file.name, "Match Percentage": "Invalid File Type"}
+
+            # Clean up the temporary file
+            os.unlink(temp_path)
+
+            if not text.strip():
+                return {"Resume": file.name, "Match Percentage": "No text extracted"}
+
+            # Process the text
+            entities = self.extract_named_entities(text)
+            job_entities = self.extract_named_entities(job_desc)
+
+            entity_match = (
+                len(entities.intersection(job_entities)) / len(job_entities) * 100
+                if job_entities else 0
+            )
+
+            match_percentage = self.calculate_match_percentage(text, job_desc)
+            gemini_analysis = self.gemini_analysis(text)
+
+            return {
+                "Resume": file.name,
+                "Match Percentage": round(match_percentage, 2),
+                "Entity Match (%)": round(entity_match, 2),
+                "AI Analysis": gemini_analysis
+            }
+
+        except Exception as e:
+            logger.error(f"Error processing file {file.name}: {str(e)}")
+            return {"Resume": file.name, "Error": str(e)}
+
+    def process_uploaded_resumes(self, resume_files: List[gr.File], job_desc: str) -> pd.DataFrame:
+        """Process multiple resume files."""
+        if not resume_files:
+            return pd.DataFrame({"Message": ["Please upload at least one resume."]})
+
+        if not job_desc.strip():
+            return pd.DataFrame({"Message": ["Please provide a job description."]})
+
+        results = []
+        for file in resume_files:
+            result = self.process_file(file, job_desc)
+            results.append(result)
+
+        return pd.DataFrame(results)
 
 # Create the Gradio interface
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+analyzer = ResumeAnalyzer()
+
+interface = gr.Interface(
+    fn=analyzer.process_uploaded_resumes,
+    inputs=[
+        gr.Files(
+            label="Upload Resumes (PDF or DOCX)",
+            file_types=[".pdf", ".docx"],
+            type="file"
+        ),
+        gr.Textbox(
+            label="Job Description",
+            placeholder="Paste the job description here...",
+            lines=6
+        )
+    ],
+    outputs=gr.DataFrame(label="Analysis Results"),
+    title="AI Resume Analyzer",
+    description="""
+    Upload resumes (PDF or DOCX) and provide a job description to see how well they match.
+    The analysis includes:
+    - Overall match percentage
+    - Key skills and experience matching
+    - AI-powered resume analysis
+    """,
+    allow_flagging="never",
+    theme=gr.themes.Soft()
+)
+
+# Launch the interface
+if __name__ == "__main__":
+    interface.launch()