# resume_parser / app.py
# Prernas19's picture
# Update app.py
# 51593d8 verified
import gradio as gr
import spacy
import re
import pdfplumber
import docx
import nltk
from nltk.corpus import words
from spacy.cli import download
# Load the spaCy model, downloading it only if it is not installed yet.
# spacy.load raises OSError when the model package cannot be found, so the
# network download is skipped on every start where the model is present.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# Set of English words (membership is tested with lower-cased tokens in
# extract_summary).
nltk.download('words', quiet=True)
english_words = set(words.words())
def extract_text(file):
    """Extract the raw text of an uploaded resume.

    Args:
        file: Either an object exposing the file path as ``.name`` or a
            plain path string.

    Returns:
        The extracted text on success, ``"Unsupported file format"`` for any
        extension other than PDF/DOCX, or a string starting with
        ``"Error extracting text: "`` when extraction raised.
    """
    try:
        # Accept both upload objects with a ``name`` attribute and bare paths.
        filename = file if isinstance(file, str) else file.name
        # Compare case-insensitively so "CV.PDF" / "Resume.Docx" are accepted.
        lowered = filename.lower()
        if lowered.endswith('.pdf'):
            return extract_text_from_pdf(file)
        if lowered.endswith('.docx'):
            return extract_text_from_docx(file)
        return "Unsupported file format"
    except Exception as e:
        # Failures are reported as text; parse_resume inspects this prefix.
        return f"Error extracting text: {str(e)}"
def extract_text_from_pdf(file):
    """Return the text of every page of a PDF, one page per line group.

    Args:
        file: Whatever ``pdfplumber.open`` accepts for the PDF to read.

    Returns:
        The page texts joined with a newline. Pages are separated so the
        last word of one page is not fused with the first word of the next,
        which would corrupt the regex and entity extraction downstream.
    """
    with pdfplumber.open(file) as pdf:
        # ``or ''`` keeps pages that yield no text from breaking the join.
        page_texts = [page.extract_text() or '' for page in pdf.pages]
    return "\n".join(page_texts)
def extract_text_from_docx(file):
    """Return the text of a DOCX document, one paragraph per line."""
    document = docx.Document(file)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)
# Words that mark an ORG entity as a company; compiled once at import time
# instead of on every call.
_COMPANY_PATTERN = re.compile(
    r'\b(?:Inc|Corp|LLC|Ltd|Co|Company|Group|Services|Technologies|Pvt|Solutions|Consulting)\b',
    re.IGNORECASE,
)


def extract_companies(text):
    """List the company names found in ``text``.

    An entity counts as a company when spaCy labels it ``ORG`` and it
    contains one of the company keywords (Inc, Ltd, Technologies, ...).

    Args:
        text: Resume text to analyse.

    Returns:
        The distinct company names in order of first appearance, one per
        line; an empty string when none are found.
    """
    doc = nlp(text)
    # A dict is used as an ordered set so repeated mentions are listed once.
    companies = {}
    for ent in doc.ents:
        if ent.label_ != "ORG":
            continue
        # Collapse internal whitespace: an entity spanning a line break would
        # otherwise be split across two rows of the newline-separated output.
        name = " ".join(ent.text.split())
        if name and _COMPANY_PATTERN.search(name):
            companies.setdefault(name, None)
    # Join companies with new lines
    return "\n".join(companies)
# Substrings that mark an ORG entity as an educational institution.
_EDU_KEYWORDS = (
    "university", "college", "institute", "school", "academy", "polytechnic",
    "faculty", "department", "center", "centre", "campus", "educational",
    "institute of technology",
)


def extract_colleges(text):
    """List the educational institutions found in ``text``.

    An entity counts as a college when spaCy labels it ``ORG`` and its
    lower-cased text contains one of the education keywords.

    Args:
        text: Resume text to analyse.

    Returns:
        The distinct institution names in order of first appearance, one per
        line; an empty string when none are found.
    """
    doc = nlp(text)
    # A dict is used as an ordered set so repeated mentions are listed once.
    colleges = {}
    for sent in doc.sents:
        for ent in sent.ents:
            if ent.label_ != "ORG":
                continue
            # Collapse internal whitespace so a name containing a line break
            # stays on a single row of the newline-separated output.
            name = " ".join(ent.text.split())
            lowered = name.lower()
            if any(keyword in lowered for keyword in _EDU_KEYWORDS):
                colleges.setdefault(name, None)
    # Join colleges with new lines
    return "\n".join(colleges)
# "<number> year(s)" / "<number> month(s)" with an optional decimal part and an
# optional trailing "+". The look-behinds stop a match from starting in the
# middle of a number (so "2.5 years" is 2.5, not 5), and the trailing \b
# rejects "yearly" / "monthly".
_YEARS_RE = re.compile(
    r'(?<!\d)(?<!\d\.)(\d+(?:\.\d+)?)\+?\s+years?\b', re.IGNORECASE)
_MONTHS_RE = re.compile(
    r'(?<!\d)(?<!\d\.)(\d+(?:\.\d+)?)\+?\s+months?\b', re.IGNORECASE)


def extract_years_of_experience(text):
    """Sum every "N years" / "N months" mention in ``text``.

    Args:
        text: Resume text to scan.

    Returns:
        ``"<Y> years and <M> months"`` with the months normalised to 0-11,
        or ``"Not available"`` when the total rounds to zero months.
    """
    years = sum(float(value) for value in _YEARS_RE.findall(text))
    months = sum(float(value) for value in _MONTHS_RE.findall(text))
    # Work in whole months so fractional years ("2.5 years") carry over.
    total_months = round(years * 12 + months)
    if not total_months:
        return "Not available"
    whole_years, remaining_months = divmod(total_months, 12)
    return f"{whole_years} years and {remaining_months} months"
# North-American style number: optional "+1"/"1" prefix, area code with or
# without parentheses, then 3 + 4 digits with optional "-", "." or space
# separators. A plain leading \b can never sit in front of "(" or "+" after
# whitespace, so the match may also start at one of those characters.
_PHONE_RE = re.compile(
    r'(?:\b|(?=[+(]))(?:\+?1[-.\s]?)?(?:\(\d{3}\)|\d{3})[-.\s]?\d{3}[-.\s]?\d{4}\b'
)


def extract_phone(text):
    """Return the first phone number found in ``text``.

    Args:
        text: Resume text to scan.

    Returns:
        The matched number exactly as written (including a leading "+" or
        parenthesised area code), or ``"Not found"``.
    """
    match = _PHONE_RE.search(text)
    return match.group() if match else "Not found"
def extract_email(text):
    """Return the first e-mail address found in ``text``.

    Args:
        text: Resume text to scan.

    Returns:
        The matched address, or ``"Not found"``.
    """
    # The TLD class is letters only; the previous ``[A-Z|a-z]`` also accepted a
    # literal "|", so "a@b.com|next" was returned with the separator attached.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    match = re.search(email_pattern, text)
    return match.group() if match else "Not found"
def extract_summary(doc):
    """Build a short summary from the first "wordy" sentences of ``doc``.

    A sentence qualifies when it has more than five whitespace-separated
    tokens and over 70% of them (lower-cased) are in ``english_words``.
    At most three qualifying sentences are kept, joined by single spaces.
    """
    picked = []
    for sentence in doc.sents:
        if len(picked) >= 3:  # Limit to 3 sentences
            break
        tokens = sentence.text.split()
        if len(tokens) <= 5:
            continue
        known = sum(1 for token in tokens if token.lower() in english_words)
        if known / len(tokens) > 0.7:
            picked.append(sentence.text)
    return " ".join(picked)
# LinkedIn profile URL. The scheme ("https://", "http://" or "//") is optional
# because resumes often print just "linkedin.com/in/<id>"; without a scheme the
# match must not start in the middle of a word. The id class is spelled out:
# the previous ``A-z`` range also matched "[", "\", "]", "^" and "`".
_LINKEDIN_RE = re.compile(
    r'(?:(?:https?:)?//|(?<![\w-]))(?:\w+\.)?linkedin\.com/in/[A-Za-z0-9_-]+/?'
)


def extract_linkedin(text):
    """Return the first LinkedIn profile URL found in ``text``.

    Args:
        text: Resume text to scan.

    Returns:
        The matched URL as written, or ``"Not found"``.
    """
    match = _LINKEDIN_RE.search(text)
    return match.group() if match else "Not found"
def _error_outputs(message):
    """Return ``message`` in the first output slot and blanks in the other six."""
    return (message, "", "", "", "", "", "")


def parse_resume(file):
    """Extract the key fields of an uploaded resume.

    Args:
        file: The uploaded resume, passed through to ``extract_text``.

    Returns:
        A 7-tuple of strings: companies, colleges, years of experience,
        phone, e-mail, summary and LinkedIn URL. On failure the tuple still
        has seven items, with the error message first and the rest empty,
        because the Gradio interface is declared with seven output
        components and cannot render a dict or a single string.
    """
    try:
        text = extract_text(file)
        # Match extract_text's exact error prefix so a resume whose own text
        # merely begins with "Error" is not mistaken for a failure.
        if text.startswith("Error extracting text:") or text == "Unsupported file format":
            return _error_outputs(text)
        doc = nlp(text)
        companies = extract_companies(text)
        colleges = extract_colleges(text)
        years_of_experience = extract_years_of_experience(text)
        phone = extract_phone(text)
        email = extract_email(text)
        summary = extract_summary(doc)
        linkedin = extract_linkedin(text)
        return companies, colleges, years_of_experience, phone, email, summary, linkedin
    except Exception as e:
        import traceback
        return _error_outputs(
            f"An error occurred while parsing the resume: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
        )
# Build the Gradio UI: a single file upload in, one textbox per extracted
# field out.
resume_input = gr.File(label="Upload Resume (PDF or DOCX)")

# (label, extra Textbox keyword arguments), in the order parse_resume returns
# its values.
output_specs = [
    ("Companies Worked For", {"lines": 10}),
    ("Colleges Attended", {"lines": 10}),
    ("Years of Experience", {}),
    ("Phone Number", {}),
    ("Email ID", {}),
    ("Summary", {"lines": 3}),
    ("LinkedIn ID", {}),
]
result_boxes = [gr.Textbox(label=label, **options) for label, options in output_specs]

iface = gr.Interface(
    fn=parse_resume,
    inputs=resume_input,
    outputs=result_boxes,
    title="Advanced Resume Parser",
    description="Upload a resume in PDF or DOCX format to extract key information.",
)

# Start the app and request a public share link.
iface.launch(share=True)