| | |
| |
|
| | import os |
| | |
| |
|
| |
|
| | import streamlit as st |
| |
|
| | import google.generativeai as genai |
| | from PIL import Image |
| | import fitz |
| | from docx import Document |
| | import pytesseract |
| | import io |
| | import json |
| | from pathlib import Path |
| | from datetime import datetime |
| | import re |
| |
|
| | |
def extract_text_from_pdf(pdf_file):
    """Extract text from a PDF, using OCR for pages without a text layer.

    Args:
        pdf_file: Binary file-like object; its full content is consumed with
            a single ``read()`` call.

    Returns:
        The text of every page joined with newlines, or an empty string when
        extraction fails (the failure is reported through ``st.error``).
    """
    text_content = []
    try:
        pdf_bytes = pdf_file.read()
        # The context manager closes the document even when rendering or OCR
        # of a page raises part-way through.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            for page in doc:
                page_text = page.get_text()
                if not page_text.strip():
                    # No embedded text on this page: render it to a PNG and
                    # run OCR on the rendered image instead.
                    pix = page.get_pixmap()
                    with Image.open(io.BytesIO(pix.tobytes("png"))) as img:
                        page_text = pytesseract.image_to_string(img)
                text_content.append(page_text)
        return "\n".join(text_content)
    except Exception as e:
        st.error(f"PDF extraction error: {str(e)}")
        return ""
| |
|
def extract_text_from_docx(docx_file):
    """Extract the text of a DOCX document, including text inside tables.

    Paragraph text comes first (one line per paragraph), followed by one line
    per non-empty table row with its cell texts joined by " | ".

    Args:
        docx_file: Path or file-like object accepted by ``Document``.

    Returns:
        The extracted text, or an empty string when the document cannot be
        read (the failure is reported through ``st.error``).
    """
    try:
        doc = Document(docx_file)
        lines = [p.text for p in doc.paragraphs]
        # doc.paragraphs does not include paragraphs that live inside table
        # cells, so table-based layouts would otherwise lose their content.
        for table in doc.tables:
            for row in table.rows:
                cell_texts = []
                for cell in row.cells:
                    cell_text = cell.text.strip()
                    # A horizontally merged cell is reported once per spanned
                    # column; skip consecutive repeats of the same text.
                    if cell_text and (not cell_texts or cell_texts[-1] != cell_text):
                        cell_texts.append(cell_text)
                if cell_texts:
                    lines.append(" | ".join(cell_texts))
        return "\n".join(lines)
    except Exception as e:
        st.error(f"DOCX extraction error: {str(e)}")
        return ""
| |
|
def extract_text_from_image(image_file):
    """Run OCR on an image and return the recognised text.

    Args:
        image_file: Path or binary file-like object accepted by ``Image.open``.

    Returns:
        The OCR text, or an empty string when the image cannot be opened or
        processed (the failure is reported through ``st.error``).
    """
    try:
        # Release the image's resources as soon as OCR is done instead of
        # waiting for garbage collection.
        with Image.open(image_file) as image:
            return pytesseract.image_to_string(image)
    except Exception as e:
        st.error(f"Image extraction error: {str(e)}")
        return ""
| |
|
| | |
_PRESENT_WORDS = frozenset({"present", "current", "now"})
_DATE_FORMATS = ("%Y", "%b %Y", "%B %Y", "%m/%Y", "%m-%Y", "%Y/%m", "%Y-%m")
_YEAR_PATTERN = re.compile(r"\b(?:19|20)\d{2}\b")


def parse_date(date_str):
    """Parse a loosely formatted resume date into a ``datetime``.

    Recognised inputs, checked in this order:
      * "present", "current" or "now" (any case) -> ``datetime.now()``
      * one of the month/year formats in ``_DATE_FORMATS``
      * any text containing a four-digit year 19xx/20xx -> 1 January of it

    Month names are matched by ``strptime`` and therefore depend on the
    active locale.

    Args:
        date_str: The date text, e.g. "Jan 2020", "03/2021" or "Present".

    Returns:
        A ``datetime``, or ``None`` when the input is not a string or no date
        can be recognised.
    """
    if not isinstance(date_str, str):
        return None
    # Strip before the keyword check so " Present " is recognised as well.
    date_str = date_str.strip()
    if not date_str:
        return None
    if date_str.lower() in _PRESENT_WORDS:
        return datetime.now()
    for fmt in _DATE_FORMATS:
        try:
            return datetime.strptime(date_str, fmt)
        except ValueError:
            continue
    # Last resort: keep only the year from text such as "Summer 1998".
    year_match = _YEAR_PATTERN.search(date_str)
    if year_match:
        return datetime.strptime(year_match.group(), "%Y")
    return None
| |
|
# Range separators that cannot be confused with a hyphen inside a date:
# a hyphen or the word "to" surrounded by whitespace, or an en/em dash.
_RANGE_SEPARATOR = re.compile(r"\s+(?:-|to)\s+|\s*[\u2013\u2014]\s*", re.IGNORECASE)
# Fallback for compact ranges such as "2019-2021" or "Jan 2020-Present".
_BARE_HYPHEN = re.compile(r"\s*-\s*")


def calculate_experience(work_history):
    """Sum the length of all jobs in ``work_history`` in years.

    Each job is expected to be a dict whose "duration" value is a string of
    the form "<start> <separator> <end>", e.g. "Jan 2020 - Present",
    "2019-2021" or "01-2020 to 03-2021". Entries that are not dicts, have no
    usable duration, or whose dates cannot be parsed are skipped. Job lengths
    are simply added up, so overlapping jobs count twice.

    Args:
        work_history: Iterable of job dicts, or ``None``.

    Returns:
        Total experience in years, rounded to one decimal place.
    """
    total_exp = 0
    for job in work_history or []:
        if not isinstance(job, dict):
            continue
        duration = job.get("duration", "")
        if not duration or not isinstance(duration, str):
            continue
        duration = duration.strip()
        # Prefer unambiguous separators so hyphenated dates ("01-2020") stay
        # intact; only fall back to a bare hyphen when that yields no range.
        parts = _RANGE_SEPARATOR.split(duration)
        if len(parts) != 2:
            parts = _BARE_HYPHEN.split(duration)
        if len(parts) != 2:
            continue
        start, end = parse_date(parts[0]), parse_date(parts[1])
        if start and end:
            years = (end.year - start.year) + (end.month - start.month) / 12
            # An end before the start contributes nothing rather than
            # reducing the total.
            total_exp += max(0, years)
    return round(total_exp, 1)
| |
|
| | |
def parse_resume(file_uploaded, api_key, model_name="gemini-1.5-flash"):
    """Extract structured resume data from an uploaded file using Gemini.

    The file's text is extracted according to its extension (PDF, DOCX/DOC or
    image), sent to the model together with an extraction prompt, and the
    JSON object found in the reply is decoded. The raw reply is also shown in
    a "Raw Response" text area.

    Args:
        file_uploaded: Uploaded file object with a ``name`` attribute and
            file-like read access.
        api_key: Gemini API key.
        model_name: Name of the Gemini model to use.

    Returns:
        The decoded dict with an added "total_years_experience" entry, or
        ``None`` when the file type is unsupported, no text is found, or the
        model call / JSON decoding fails (reported through ``st.error``).
    """
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel(model_name)

    prompt = """Extract the following information from this resume:
1. Summarize in 100 words, focus on skills, experience, qualifications.
2. Full Name
3. Email
4. Phone
5. Education (degree, institution, year, field)
6. Work experience with exact duration (e.g., Jan 2020 - Present)
7. Skills
8. LinkedIn URL

Return as JSON:
{
"summary": "", "name": "", "email": "", "phone": "",
"education": [{"degree": "", "institution": "", "year": "", "field": "", "gpa": ""}],
"work_experience": [{"company": "", "position": "", "duration": ""}],
"skills": [], "linkedin": ""
}"""

    # Rewind in case the upload was already read by an earlier call; a
    # consumed stream would otherwise look like an empty file.
    if hasattr(file_uploaded, "seek"):
        file_uploaded.seek(0)

    ext = Path(file_uploaded.name).suffix.lower()
    if ext == ".pdf":
        text_content = extract_text_from_pdf(file_uploaded)
    elif ext in [".docx", ".doc"]:
        # NOTE(review): ".doc" is routed to the DOCX reader as before; confirm
        # that legacy binary .doc files are actually readable this way.
        text_content = extract_text_from_docx(file_uploaded)
    elif ext in [".jpg", ".jpeg", ".png"]:
        text_content = extract_text_from_image(file_uploaded)
    else:
        st.error(f"Unsupported file type: {ext}")
        return None

    if not text_content.strip():
        st.error("No text found in resume.")
        return None

    try:
        response = model.generate_content(f"{prompt}\n\nResume Text:\n{text_content}")
        response_text = response.text
        st.text_area("Raw Response", response_text, height=200)

        # The reply may wrap the JSON in prose or a code fence, so decode only
        # the span from the first "{" to the last "}".
        json_start = response_text.find("{")
        json_end = response_text.rfind("}")
        if json_start == -1 or json_end < json_start:
            st.error("Error parsing resume: model response did not contain a JSON object.")
            return None
        result = json.loads(response_text[json_start:json_end + 1])

        # The model may return null for the list, so fall back to an empty one.
        work_history = result.get("work_experience") or []
        result["total_years_experience"] = calculate_experience(work_history)
        return result
    except Exception as e:
        st.error(f"Error parsing resume: {str(e)}")
        return None
| |
|
| | |
def format_education(edu):
    """Render one education entry as a single display line.

    Present, non-empty fields are emitted in this order:
    "<degree> in <field> from <institution> (<year>) - GPA: <gpa>".

    Args:
        edu: Dict with optional "degree", "field", "institution", "year" and
            "gpa" entries. Any other non-empty value (for example a plain
            string) is returned as its stripped string form.

    Returns:
        The formatted line, or an empty string when there is nothing to show.
    """
    if not isinstance(edu, dict):
        return str(edu).strip() if edu else ""
    parts = []
    if edu.get("degree"):
        # Coerce to str so a non-string degree cannot break the join below.
        parts.append(str(edu["degree"]))
    if edu.get("field"):
        parts.append(f"in {edu['field']}")
    if edu.get("institution"):
        parts.append(f"from {edu['institution']}")
    if edu.get("year"):
        parts.append(f"({edu['year']})")
    if edu.get("gpa"):
        parts.append(f"- GPA: {edu['gpa']}")
    return " ".join(parts)
| |
|
| | |
def main():
    """Run the Streamlit page: collect inputs, parse the resume, show results.

    The API key comes from the ``GEMINI_API_KEY`` environment variable or, if
    that is unset, from a password input. Nothing is parsed until both a file
    and a key are available. Missing or null fields in the parsed result are
    rendered as empty values instead of raising.
    """
    st.title("Resume Parser (PDF/DOCX/Image)")
    api_key = os.getenv("GEMINI_API_KEY") or st.text_input("Enter Gemini API Key", type="password")
    uploaded_file = st.file_uploader("Choose a resume file", type=["pdf","docx","doc","jpg","jpeg","png"])

    if not (uploaded_file and api_key):
        return

    with st.spinner("Analyzing resume..."):
        result = parse_resume(uploaded_file, api_key)

    if not result:
        return

    st.subheader("Extracted Information")
    st.text_area("Summary", result.get("summary") or "", height=100)

    # Use "or ''" rather than a .get default: the key may be present with a
    # null value, which would break string concatenation.
    col1, col2, col3 = st.columns(3)
    col1.write(f"**Name:** {result.get('name') or ''}")
    col2.write(f"**Email:** {result.get('email') or ''}")
    col3.write(f"**Phone:** {result.get('phone') or ''}")

    exp = result.get("total_years_experience") or 0
    # Under one year, months read better than "0.x years".
    exp_text = f"{exp:.1f} years" if exp >= 1 else f"{exp*12:.0f} months"
    st.write("**Total Experience:**", exp_text)

    st.subheader("Education")
    for edu in result.get("education") or []:
        line = format_education(edu) if isinstance(edu, dict) else str(edu)
        st.write(f"- {line}")

    st.subheader("Work Experience")
    for w in result.get("work_experience") or []:
        if not isinstance(w, dict):
            # Free-form entry from the model: show it as-is.
            st.write(f"- {w}")
            continue
        dur = f" ({w.get('duration','')})" if w.get("duration") else ""
        st.write(f"- {w.get('position') or ''} at {w.get('company') or ''}{dur}")

    st.subheader("Skills")
    skills = result.get("skills") or []
    if isinstance(skills, str):
        # A single string would otherwise be listed character by character.
        skills = [skills]
    for s in skills:
        st.write(f"- {s}")

    st.write("**LinkedIn:**", result.get("linkedin") or "")


if __name__ == "__main__":
    main()
| |
|