Spaces:
Sleeping
Sleeping
File size: 5,108 Bytes
25c62c3 b740a24 25c62c3 cc2242c 25c62c3 cc2242c 9f062d8 25c62c3 cc2242c b740a24 25c62c3 cc2242c 25c62c3 e39d53c b740a24 25c62c3 b740a24 8bfcb85 cc2242c b740a24 8bfcb85 b740a24 8bfcb85 cc2242c 8bfcb85 b740a24 8bfcb85 b740a24 8bfcb85 25c62c3 8bfd778 8bfcb85 e39d53c 8bfcb85 cc2242c 8bfcb85 cc2242c 8bfcb85 cc2242c 3453a71 8bfcb85 25c62c3 3453a71 cc2242c 8bfcb85 9f062d8 8bfcb85 9f062d8 8bfcb85 9f062d8 cc2242c 8bfcb85 25c62c3 8bfcb85 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 |
import os
import re
from collections import Counter

import gradio as gr
import PyPDF2
from transformers import pipeline
# Load the NER pipeline: token-level predictions are merged into whole-word
# entity spans (entity_group/word keys) via aggregation_strategy="simple".
ner_pipeline = pipeline("ner", model="dslim/bert-base-NER", tokenizer="dslim/bert-base-NER", aggregation_strategy="simple")
# Load the job-category classifier; it emits raw labels (e.g. "C1") that
# CATEGORY_MAP below translates into readable job-field names.
text_classifier = pipeline("text-classification", model="serbog/distilbert-jobCategory_410k")
# Mapping from the classifier's raw category code (e.g. "C1") to the
# human-readable job-field label shown in the UI. Codes not listed here
# fall back to "Unknown" via CATEGORY_MAP.get(..., "Unknown").
CATEGORY_MAP = {
    "C1": "Engineering",
    "C2": "Information Technology",
    "C3": "Sales & Marketing",
    "C4": "Accounting & Finance",
    "C5": "Healthcare",
    "D1": "Education",
    "D2": "Human Resources",
    "E1": "Operations & Logistics",
    "E2": "Legal",
    "F1": "Customer Support",
    "Other": "General / Undefined"
}
def clean_resume_text(text):
    """Normalize resume text for downstream models.

    Strips URLs, hashtags, @-mentions, punctuation and non-ASCII
    characters, then collapses all runs of whitespace to single spaces.
    """
    # Ordered (pattern, replacement) pairs; order matters — e.g. URLs
    # must go before the punctuation pass mangles them.
    substitutions = (
        (r'http\S+', ' '),       # URLs
        (r'#\S+', ''),           # hashtags (removed entirely, no space)
        (r'@\S+', ' '),          # @-mentions
        (r'[^\w\s]', ' '),       # punctuation / symbols
        (r'[^\x00-\x7f]', ' '),  # non-ASCII characters
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Collapse whitespace and trim the edges.
    return re.sub(r'\s+', ' ', text).strip()
def extract_resume_text(file):
    """Extract raw text from an uploaded PDF.

    Parameters:
        file: Path or file-like object accepted by ``PyPDF2.PdfReader``.

    Returns:
        ``(text, None)`` on success, or ``(None, error_message)`` on
        failure — callers branch on the second element.
    """
    try:
        reader = PyPDF2.PdfReader(file)
        # Collect per-page text and join once instead of repeated
        # string concatenation (quadratic on long documents).
        page_texts = [page.extract_text() for page in reader.pages]
        text = " ".join(t for t in page_texts if t)
        if not text.strip():
            return None, "Error: No text extracted from PDF."
        return text, None
    except Exception as e:
        # PyPDF2 raises a variety of exception types for corrupt or
        # encrypted files; surface the message instead of crashing the UI.
        return None, f"Error reading PDF: {str(e)}"
def classify_resume_ner(entities):
    """Rule-based classification from aggregated NER output.

    Picks the most frequently mentioned ORG, LOC and MISC entity as the
    resume's dominant organization, location and possible job field.
    """
    def _dominant(group, fallback):
        # Most common 'word' among entities of the given group, or the
        # fallback when the group is absent.
        words = [ent['word'] for ent in entities if ent['entity_group'] == group]
        top = Counter(words).most_common(1)
        return top[0][0] if top else fallback

    return {
        "Main_Organization": _dominant('ORG', "Unknown"),
        "Main_Location": _dominant('LOC', "Unknown"),
        "Possible_Job/Field (NER)": _dominant('MISC', "General"),
    }
def process_resumes(files):
    """Run NER over each uploaded resume and summarize the entities found.

    Parameters:
        files: Iterable of uploaded file objects (each exposes ``.name``).

    Returns:
        Dict keyed by file name; each value is either ``{"error": ...}``
        or a dict of deduplicated entity lists, the cleaned text, and the
        rule-based classification from ``classify_resume_ner``.
    """
    all_results = {}
    for file in files:
        # os.path.basename is portable; splitting on "/" breaks on
        # Windows-style paths.
        file_name = os.path.basename(file.name)
        resume_text, error = extract_resume_text(file)
        if error:
            all_results[file_name] = {"error": error}
            continue
        cleaned_text = clean_resume_text(resume_text)
        entities = ner_pipeline(cleaned_text)
        classification = classify_resume_ner(entities)
        # Set comprehensions deduplicate repeated mentions before
        # converting to JSON-serializable lists.
        all_results[file_name] = {
            "Persons": list({e["word"] for e in entities if e["entity_group"] == "PER"}),
            "Organizations": list({e["word"] for e in entities if e["entity_group"] == "ORG"}),
            "Locations": list({e["word"] for e in entities if e["entity_group"] == "LOC"}),
            "Other Entities": list({e["word"] for e in entities if e["entity_group"] not in ["PER", "ORG", "LOC"]}),
            "Cleaned_Text": cleaned_text,
            "Classification (NER)": classification
        }
    return all_results
def classify_resumes_with_model(files):
    """Predict a readable job category for each uploaded resume.

    Parameters:
        files: Iterable of uploaded file objects (each exposes ``.name``).

    Returns:
        Dict keyed by file name; each value is either ``{"error": ...}``
        or the predicted category (readable label, raw model label, and
        confidence score rounded to 4 decimals).
    """
    predictions = {}
    for file in files:
        # os.path.basename is portable; splitting on "/" breaks on
        # Windows-style paths (consistent with process_resumes).
        file_name = os.path.basename(file.name)
        resume_text, error = extract_resume_text(file)
        if error:
            predictions[file_name] = {"error": error}
            continue
        cleaned_text = clean_resume_text(resume_text)
        # Character-level truncation is a rough heuristic to stay under
        # the model's input limit; tokens, not characters, are what the
        # model actually counts.
        result = text_classifier(cleaned_text[:512])
        raw_label = result[0]['label']
        # Unknown raw codes fall back to "Unknown" rather than raising.
        readable_label = CATEGORY_MAP.get(raw_label, "Unknown")
        predictions[file_name] = {
            "Predicted Job Category": readable_label,
            "Raw Label": raw_label,
            "Confidence Score": round(result[0]['score'], 4)
        }
    return predictions
# Gradio interface: one multi-file upload widget feeding two independent
# actions (NER extraction and model-based classification), each rendered
# into its own JSON output panel.
with gr.Blocks(title="Resume Analyzer") as demo:
    # NOTE(review): the stray "π"/"π§" characters in the labels below look
    # like mojibake of emoji from an earlier encoding — confirm the
    # intended glyphs before changing these runtime strings.
    gr.Markdown("## π Multi-Resume Entity Extractor & Job Classifier\nUpload multiple PDF resumes. This tool extracts entities using NER and predicts the job field using a trained classifier model.")
    with gr.Row():
        file_input = gr.File(file_types=[".pdf"], label="Upload Resume PDFs", file_count="multiple")
    with gr.Row():
        extract_button = gr.Button("π Extract Entities (NER)")
        classify_button = gr.Button("π§ Predict Job Category (Model)")
    output_entities = gr.JSON(label="NER Results & Classification")
    output_class = gr.JSON(label="Model-Predicted Job Category")
    # Both buttons read the same uploaded file list.
    extract_button.click(fn=process_resumes, inputs=[file_input], outputs=[output_entities])
    classify_button.click(fn=classify_resumes_with_model, inputs=[file_input], outputs=[output_class])

# Launch the app only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch()
|