# Spaces: Sleeping
import re
import traceback

import docx
import gradio as gr
import nltk
import pdfplumber
import spacy
from nltk.corpus import words
from spacy.cli import download
# Load the spaCy English pipeline, downloading it only when it is missing.
# spacy.load raises OSError when the model package cannot be found, so the
# (slow, network-bound) download is skipped on every start after the first.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# Set of English words, used by extract_summary to score sentences.
nltk.download('words', quiet=True)
english_words = set(words.words())
def extract_text(file):
    """Extract plain text from an uploaded resume.

    Args:
        file: Either a filesystem path or an object whose ``name``
            attribute holds the path/filename. It is passed unchanged to
            the format-specific extractor.

    Returns:
        The extracted text; ``"Unsupported file format"`` when the name
        ends in neither ``.pdf`` nor ``.docx`` (case-insensitive); or a
        message starting with ``"Error extracting text: "`` when anything
        goes wrong. Failures are reported through the return value rather
        than raised.
    """
    try:
        if file is None:
            raise ValueError("no file provided")
        # Plain strings have no ``name`` attribute, so they fall through
        # to the default and are used as the path themselves.
        filename = str(getattr(file, "name", file)).lower()
        if filename.endswith('.pdf'):
            return extract_text_from_pdf(file)
        if filename.endswith('.docx'):
            return extract_text_from_docx(file)
        return "Unsupported file format"
    except Exception as e:
        return f"Error extracting text: {str(e)}"
def extract_text_from_pdf(file):
    """Return the text of every page of a PDF, one page per line group.

    Args:
        file: Whatever ``pdfplumber.open`` accepts (passed through as-is).

    Returns:
        The page texts joined with newlines, so the last word of one page
        is never fused with the first word of the next.
    """
    with pdfplumber.open(file) as pdf:
        # ``or ''`` guards against a falsy result from extract_text().
        return "\n".join(page.extract_text() or '' for page in pdf.pages)
def extract_text_from_docx(file):
    """Return the text of all paragraphs in a DOCX file, newline-separated."""
    document = docx.Document(file)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)
# Corporate designators that mark an ORG entity as a company name.
_COMPANY_SUFFIX_RE = re.compile(
    r'\b(?:Inc|Corp|LLC|Ltd|Co|Company|Group|Services|Technologies|Pvt|Solutions|Consulting)\b',
    re.IGNORECASE,
)


def extract_companies(text):
    """List the company names mentioned in ``text``.

    Runs the spaCy pipeline over ``text`` and keeps the ORG entities whose
    text contains one of the corporate designators as a whole word
    (case-insensitive).

    Args:
        text: Resume text.

    Returns:
        The matching entity texts, one per line, in order of first
        appearance and without repeats; an empty string when none match.
    """
    doc = nlp(text)
    matches = (
        ent.text
        for ent in doc.ents
        if ent.label_ == "ORG" and _COMPANY_SUFFIX_RE.search(ent.text)
    )
    # dict.fromkeys drops repeated mentions while keeping first-seen order.
    return "\n".join(dict.fromkeys(matches))
def extract_colleges(text):
    """List the educational institutions mentioned in ``text``.

    Runs the spaCy pipeline over ``text`` and, sentence by sentence, keeps
    the ORG entities whose lower-cased text contains any education keyword
    as a substring.

    Args:
        text: Resume text.

    Returns:
        The matching entity texts, one per line, in order of first
        appearance and without repeats; an empty string when none match.
    """
    edu_keywords = (
        "university", "college", "institute", "school", "academy",
        "polytechnic", "faculty", "department", "center", "centre",
        "campus", "educational",
    )
    doc = nlp(text)
    matches = (
        ent.text
        for sent in doc.sents
        for ent in sent.ents
        if ent.label_ == "ORG"
        and any(keyword in ent.text.lower() for keyword in edu_keywords)
    )
    # dict.fromkeys drops repeated mentions while keeping first-seen order.
    return "\n".join(dict.fromkeys(matches))
# A duration figure: an integer or decimal that does not start in the middle
# of another number, optionally followed by "+" ("5+ years") and whitespace.
_DURATION_NUMBER = r'(?<![\d.])(\d+(?:\.\d+)?)\+?\s*'
_YEARS_RE = re.compile(_DURATION_NUMBER + r'years?\b', re.IGNORECASE)
_MONTHS_RE = re.compile(_DURATION_NUMBER + r'months?\b', re.IGNORECASE)


def extract_years_of_experience(text):
    """Sum every "<n> year(s)" / "<n> month(s)" mention found in ``text``.

    Decimal figures ("2.5 years") and a trailing plus sign ("5+ years")
    are understood. The unit must be a whole word, so "3 yearly" is not
    counted.

    Args:
        text: Resume text.

    Returns:
        ``"<Y> years and <M> months"`` with the months normalised to
        0-11, or ``"Not available"`` when the total rounds to zero.
    """
    years = sum(float(value) for value in _YEARS_RE.findall(text))
    months = sum(float(value) for value in _MONTHS_RE.findall(text))
    # Work in months so fractional years carry over into the month figure.
    total_months = round(years * 12 + months)
    if not total_months:
        return "Not available"
    whole_years, leftover_months = divmod(total_months, 12)
    return f"{whole_years} years and {leftover_months} months"
# North-American style number: optional "+1"/"1" prefix, area code with or
# without parentheses, then 3 + 4 digits, each part optionally separated by
# "-", "." or whitespace.
_PHONE_RE = re.compile(
    # A match may start on "+" or "(" anywhere, or on a digit that is not
    # glued to a preceding word character. A plain leading \b would reject
    # "(" and "+" after a space, since neither is a word character.
    r'(?:(?<!\w)(?=\d)|(?=[+(]))'
    r'(?:\+?1[-.\s]?)?'
    r'(?:\(\d{3}\)|\d{3})[-.\s]?\d{3}[-.\s]?\d{4}\b'
)


def extract_phone(text):
    """Return the first phone number found in ``text``.

    Args:
        text: Resume text.

    Returns:
        The matched number exactly as written (including a leading "+"
        or parentheses), or ``"Not found"``.
    """
    match = _PHONE_RE.search(text)
    return match.group() if match else "Not found"
def extract_email(text):
    """Return the first e-mail address found in ``text``.

    Args:
        text: Resume text.

    Returns:
        The matched address, or ``"Not found"``.
    """
    # The top-level domain is letters only. Inside a character class "|" is
    # a literal pipe, not alternation, so it must not appear there.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    match = re.search(email_pattern, text)
    return match.group() if match else "Not found"
def extract_summary(doc):
    """Build a short summary from the first few "English-looking" sentences.

    Walks ``doc.sents`` in order and keeps a sentence when it has more than
    five whitespace-separated tokens and more than 70% of those tokens
    (lower-cased) are in ``english_words``. Stops after three sentences.

    Args:
        doc: Object exposing ``sents``, each item having a ``text`` string.

    Returns:
        The kept sentences joined by single spaces (possibly empty).
    """
    picked = []
    for sentence in doc.sents:
        if len(picked) >= 3:  # Limit to 3 sentences
            break
        tokens = sentence.text.split()
        if len(tokens) <= 5:
            continue
        known = sum(1 for token in tokens if token.lower() in english_words)
        if known / len(tokens) > 0.7:
            picked.append(sentence.text)
    return " ".join(picked)
def extract_linkedin(text):
    """Return the first LinkedIn profile URL found in ``text``.

    Accepts the URL with a scheme ("https://..."), protocol-relative
    ("//..."), or bare ("linkedin.com/in/...", "www.linkedin.com/in/...").

    Args:
        text: Resume text.

    Returns:
        The matched URL as written, or ``"Not found"``.
    """
    linkedin_pattern = (
        # Either an explicit "//" prefix, or a bare host that does not
        # continue a longer host name (so "notlinkedin.com" is rejected).
        r'(?:(?:https?:)?//|(?<![\w.-]))'
        r'(?:\w+\.)?linkedin\.com/in/'
        # Spelled-out ranges: "A-z" would also admit [ \ ] ^ _ and backtick.
        r'[A-Za-z0-9_-]+/?'
    )
    match = re.search(linkedin_pattern, text)
    return match.group() if match else "Not found"
def parse_resume(file):
    """Parse an uploaded resume into the seven fields shown in the UI.

    Args:
        file: The uploaded file, forwarded to ``extract_text``.

    Returns:
        A 7-tuple of strings in this order: companies, colleges, years of
        experience, phone, e-mail, summary, LinkedIn URL. On failure the
        tuple still has seven items: the first holds the error message and
        the rest are empty, so the result always matches the seven output
        components of the interface.
    """
    blank_fields = ("",) * 6
    try:
        text = extract_text(file)
        # extract_text reports failures through its return value.
        if text.startswith("Error extracting text:") or text == "Unsupported file format":
            return (text, *blank_fields)
        doc = nlp(text)
        return (
            extract_companies(text),
            extract_colleges(text),
            extract_years_of_experience(text),
            extract_phone(text),
            extract_email(text),
            extract_summary(doc),
            extract_linkedin(text),
        )
    except Exception as e:
        # Keep the stack trace in the server log instead of sending
        # internals to the browser.
        traceback.print_exc()
        return (f"An error occurred while parsing the resume: {str(e)}", *blank_fields)
# Assemble the Gradio UI: one upload widget in, seven text boxes out.
# The boxes are listed in the same order as the values parse_resume returns.
resume_upload = gr.File(label="Upload Resume (PDF or DOCX)")
result_boxes = [
    gr.Textbox(label="Companies Worked For", lines=10),
    gr.Textbox(label="Colleges Attended", lines=10),
    gr.Textbox(label="Years of Experience"),
    gr.Textbox(label="Phone Number"),
    gr.Textbox(label="Email ID"),
    gr.Textbox(label="Summary", lines=3),
    gr.Textbox(label="LinkedIn ID"),
]
iface = gr.Interface(
    fn=parse_resume,
    inputs=resume_upload,
    outputs=result_boxes,
    title="Advanced Resume Parser",
    description="Upload a resume in PDF or DOCX format to extract key information.",
)
# Start the web server and request a public share link.
iface.launch(share=True)