|
import streamlit as st |
|
import google.generativeai as genai |
|
from PIL import Image |
|
import fitz |
|
from docx import Document |
|
import json |
|
from pathlib import Path |
|
from datetime import datetime |
|
import re |
|
import pytesseract |
|
import io |
|
|
|
def extract_text_from_pdf(pdf_file):
    """Extract the text layer from an uploaded PDF file.

    Args:
        pdf_file: A binary file-like object exposing ``read()`` that holds
            the PDF data (for example a Streamlit upload).

    Returns:
        The text of every page joined with newlines, or ``""`` when the
        document cannot be opened or read. In the failure case the error is
        reported to the page through ``st.error``.
    """
    try:
        pdf_bytes = pdf_file.read()
        # Open the document as a context manager so it is closed even when
        # extracting one of the pages fails.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            return "\n".join(page.get_text() for page in doc)
    except Exception as e:
        st.error(f"Error in PDF extraction: {str(e)}")
        return ""
|
|
|
def extract_text_from_docx(docx_file):
    """Return the paragraph text of an uploaded DOCX file.

    Args:
        docx_file: A path or file-like object accepted by ``Document``.

    Returns:
        The text of each paragraph joined with newlines, or ``""`` when the
        file cannot be read (the error is shown through ``st.error``).
    """
    try:
        paragraphs = Document(docx_file).paragraphs
        return "\n".join(para.text for para in paragraphs)
    except Exception as err:
        st.error(f"Error in DOCX extraction: {str(err)}")
        return ""
|
|
|
def parse_date(date_str):
    """Parse a loosely formatted résumé date into a ``datetime``.

    Recognised inputs, after surrounding whitespace is removed:

    * the keywords ``present``, ``current`` and ``now`` (any case), which
      map to the current local time;
    * ``YYYY``, ``Mon YYYY``, ``Month YYYY``, ``MM/YYYY``, ``MM-YYYY``,
      ``YYYY/MM`` and ``YYYY-MM``;
    * any other text containing a four-digit year from 1900 to 2099, which
      resolves to 1 January of the first such year.

    Args:
        date_str: The text to interpret.

    Returns:
        A naive ``datetime``, or ``None`` when ``date_str`` is not a string
        or contains nothing that looks like a date.
    """
    if not isinstance(date_str, str):
        return None

    # Strip first so padded keywords such as " Present " are recognised.
    cleaned = date_str.strip()
    if not cleaned:
        return None

    if cleaned.lower() in ('present', 'current', 'now'):
        return datetime.now()

    formats = (
        '%Y', '%b %Y', '%B %Y', '%m/%Y', '%m-%Y',
        '%Y/%m', '%Y-%m',
    )
    for fmt in formats:
        try:
            return datetime.strptime(cleaned, fmt)
        except ValueError:
            continue

    # Last resort: pick the first plausible year out of free text. Both
    # centuries are accepted so that pre-2000 dates are not dropped.
    year_match = re.search(r'\b(?:19|20)\d{2}\b', cleaned)
    if year_match:
        return datetime.strptime(year_match.group(), '%Y')

    return None
|
|
|
# Range separators surrounded by whitespace ("Jan 2020 - Present",
# "2020-01 to 2021-05"). Tried first so hyphens inside a date survive.
_SPACED_RANGE_RE = re.compile(r'\s+(?:[-\u2013\u2014]|to)\s+', re.IGNORECASE)

# Bare hyphen / en dash / em dash with optional whitespace ("2018-2020").
_TIGHT_RANGE_RE = re.compile(r'\s*[-\u2013\u2014]\s*')


def _split_duration(duration):
    """Split a duration string into ``[start, end]``, or return ``None``."""
    text = duration.strip()
    for pattern in (_SPACED_RANGE_RE, _TIGHT_RANGE_RE):
        parts = pattern.split(text)
        if len(parts) == 2:
            return parts
    return None


def calculate_experience(work_history):
    """Calculate total years of experience from a work history.

    Each entry's ``'duration'`` string is split into a start and an end
    (separated by a hyphen, en dash, em dash or the word "to"), both ends
    are parsed with ``parse_date``, and the month-granular difference is
    added to the total. Entries that are not dicts, have no usable
    duration, or whose dates cannot be parsed are skipped. Negative spans
    count as zero. Overlapping jobs are summed independently.

    Args:
        work_history: An iterable of dicts with an optional ``'duration'``
            key. ``None`` is treated as an empty history.

    Returns:
        The total experience in years, rounded to one decimal place.
    """
    total_experience = 0

    for job in work_history or []:
        if not isinstance(job, dict):
            continue

        duration = job.get('duration')
        if not duration or not isinstance(duration, str):
            continue

        parts = _split_duration(duration)
        if parts is None:
            continue

        start_date = parse_date(parts[0])
        end_date = parse_date(parts[1])
        if start_date and end_date:
            years = (end_date.year - start_date.year) + \
                (end_date.month - start_date.month) / 12
            total_experience += max(0, years)

    return round(total_experience, 1)
|
|
|
# Instruction block sent to the model ahead of the extracted résumé text.
_RESUME_PROMPT = """Extract the following information from this resume:
1. Summarize the following resume in 100 words, focusing on key skills, experience, and qualifications
2. Full Name
3. Email Address
4. Phone Number
5. Education History (including degree, institution, graduation year, and field of study)
6. Companies worked at with positions and EXACT duration (e.g., "Jan 2020 - Present" or "2018-2020")
7. Skills
8. LinkedIn Profile URL
Return the information in this JSON format:
{
"summary": "",
"name": "",
"email": "",
"phone": "",
"education": [
{
"degree": "",
"institution": "",
"year": "",
"field": "",
"gpa": ""
}
],
"work_experience": [
{
"company": "",
"position": "",
"duration": ""
}
],
"skills": [],
"linkedin": ""
}
For skills include tools and technologies in output if present any in resume.
For work experience durations, please specify exact dates in format: "MMM YYYY - MMM YYYY" or "YYYY - Present" , please return in one order either in ascending or descending.
Only return the JSON object, nothing else. If any field is not found, leave it empty."""


def parse_resume(file_uploaded, api_key):
    """Parse an uploaded résumé and extract structured information.

    The file's text is extracted according to its extension (PDF, DOCX/DOC,
    or an image run through OCR), sent to the Gemini model together with the
    extraction prompt, and the JSON object found in the reply is decoded.

    Args:
        file_uploaded: The uploaded file object. It must expose ``name`` and
            be readable by the extractor for its type.
        api_key: The Gemini API key used to configure the client.

    Returns:
        The decoded dict (keys as requested in the prompt) with an added
        ``'total_years_experience'`` entry, or ``None`` when the format is
        unsupported, no text could be extracted, the reply is not valid
        JSON, or any other error occurs. Every failure is reported to the
        page through ``st.error``.
    """
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel('gemini-1.5-flash')

    try:
        file_extension = Path(file_uploaded.name).suffix.lower()

        # An earlier read may have left the stream at EOF; rewind so the
        # extractor sees the whole file.
        if hasattr(file_uploaded, 'seek'):
            file_uploaded.seek(0)

        if file_extension == '.pdf':
            text_content = extract_text_from_pdf(file_uploaded)
        elif file_extension in ['.docx', '.doc']:
            text_content = extract_text_from_docx(file_uploaded)
        elif file_extension in ['.jpg', '.jpeg', '.png']:
            image = Image.open(file_uploaded)
            text_content = pytesseract.image_to_string(image)
        else:
            st.error(f"Unsupported file format: {file_extension}")
            return None

        # Do not ask the model to analyse nothing: with an empty document it
        # could only return blanks or invented data.
        if not text_content or not text_content.strip():
            st.error("No readable text could be extracted from the uploaded file.")
            return None

        response = model.generate_content(
            f"{_RESUME_PROMPT}\n\nResume Text:\n{text_content}"
        )

        # The model may wrap the JSON in prose or code fences, so keep only
        # the outermost {...} span.
        response_text = response.text
        json_start = response_text.find('{')
        json_end = response_text.rfind('}') + 1

        try:
            result = json.loads(response_text[json_start:json_end])
        except json.JSONDecodeError as e:
            st.error(f"Error parsing response: {str(e)}")
            return None

        # "work_experience" may be missing or null in the reply.
        work_history = result.get('work_experience') or []
        result['total_years_experience'] = calculate_experience(work_history)
        return result

    except Exception as e:
        st.error(f"Error processing resume: {str(e)}")
        return None
|
|
|
def format_education(edu):
    """Format one education entry as a single display line.

    The result has the shape
    ``"<degree> in <field> from <institution> (<year>) - GPA: <gpa>"``,
    with every missing or blank part omitted.

    Args:
        edu: A dict with optional ``'degree'``, ``'field'``,
            ``'institution'``, ``'year'`` and ``'gpa'`` entries. Values may
            be strings or numbers. Any non-dict value is rendered with
            ``str()``.

    Returns:
        The formatted line, or ``""`` when nothing is available.
    """
    if not isinstance(edu, dict):
        return str(edu).strip() if edu else ""

    def _clean(key):
        # Values come from decoded JSON and may be numbers (e.g. a GPA of
        # 3.8), so coerce to text before stripping.
        value = edu.get(key)
        return str(value).strip() if value else ""

    degree = _clean('degree')
    field = _clean('field')
    institution = _clean('institution')
    year = _clean('year')
    gpa = _clean('gpa')

    parts = []
    if degree:
        parts.append(degree)
    if field:
        parts.append(f"in {field}")
    if institution:
        parts.append(f"from {institution}")
    if year:
        parts.append(f"({year})")
    if gpa:
        parts.append(f"- GPA: {gpa}")
    return " ".join(parts)
|
|
|
def _get_api_key():
    """Return the Gemini API key from Streamlit secrets, else ask for it."""
    try:
        if "GEMINI_API_KEY" in st.secrets:
            return st.secrets["GEMINI_API_KEY"]
    except FileNotFoundError:
        # NOTE(review): Streamlit appears to raise this when no secrets file
        # is configured; fall back to manual entry instead of crashing.
        pass
    return st.text_input("Enter Gemini API Key", type="password")


def main():
    """Render the résumé-parser page.

    Asks for an API key (unless one is stored in Streamlit secrets) and a
    résumé file, runs ``parse_resume`` on the upload, and displays the
    summary, contact details, total experience, education, work history,
    skills and LinkedIn URL that were extracted. Fields the model returned
    empty are shown with a placeholder.
    """
    st.title("Resume Parser")
    st.write("Upload a resume (PDF, DOCX, or Image) to extract information")

    api_key = _get_api_key()

    uploaded_file = st.file_uploader(
        "Choose a resume file",
        type=["pdf", "docx", "doc", "jpg", "jpeg", "png"],
    )

    if not (uploaded_file and api_key):
        return

    with st.spinner('Analyzing resume...'):
        result = parse_resume(uploaded_file, api_key)

    if not result:
        return

    st.subheader("Extracted Information")

    # The prompt tells the model to leave unknown fields empty, so the keys
    # are normally present with "" values; `or` makes the placeholder apply
    # to those as well as to missing keys.
    st.text_area("Summary", result.get('summary') or 'Not found', height=100)

    col1, col2, col3 = st.columns(3)
    with col1:
        st.write("*Name:*", result.get('name') or 'Not found')
    with col2:
        st.write("*Email:*", result.get('email') or 'Not found')
    with col3:
        st.write("*Phone:*", result.get('phone') or 'Not found')

    # Show durations under one year in months.
    total_exp = result.get('total_years_experience') or 0
    exp_text = f"{total_exp:.1f} years" if total_exp >= 1 else f"{total_exp * 12:.0f} months"
    st.write("*Total Experience:*", exp_text)

    st.subheader("Education")
    if result.get('education'):
        for edu in result['education']:
            st.write(f"- {format_education(edu)}")
    else:
        st.write("No education information found")

    st.subheader("Work Experience")
    if result.get('work_experience'):
        for exp in result['work_experience']:
            position = exp.get('position') or 'Role not found'
            company = exp.get('company') or 'Company not found'
            duration = f" ({exp['duration']})" if exp.get('duration') else ""
            st.write(f"- {position} at {company}{duration}")
    else:
        st.write("No work experience found")

    st.subheader("Skills:")
    if result.get('skills'):
        for skill in result['skills']:
            st.write(f"- {skill}")
    else:
        st.write("- No skills found")

    st.write("*LinkedIn Profile:*", result.get('linkedin') or 'Not found')
|
|
|
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()