"""Streamlit resume analyzer.

Extracts text from an uploaded PDF/DOCX resume, runs it through spaCy, a
Hugging Face token-classification pipeline and a prompt-driven seq2seq model,
and shows the extracted fields together with a Gemini-generated summary.
"""

import os
import re
from datetime import datetime

import dateparser
import fitz  # PyMuPDF for PDF text extraction
import google.generativeai as genai
import spacy
import streamlit as st
from docx import Document
from spacy.tokens import Doc
from spacy.util import filter_spans
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Keywords that mark an ORG entity as an educational institution.
_EDUCATION_KEYWORDS = ("university", "college", "institute", "school")

# Custom Doc attributes filled in by EnhancedNERPipeline.  Streamlit re-runs
# this script inside the same process, and spaCy raises if an extension is
# registered twice, hence the has_extension guard.
for _extension in ("gliner_companies", "gliner_experience", "gliner_education"):
    if not Doc.has_extension(_extension):
        Doc.set_extension(_extension, default=None)


@st.cache_resource
def _load_models():
    """Load every NLP model once and reuse it across Streamlit reruns.

    Returns:
        Tuple ``(nlp_spacy, tokenizer_ner, model_ner, nlp_ner,
        gliner_tokenizer, gliner_model)``.
    """
    # Load SpaCy model
    spacy_model = spacy.load('en_core_web_sm')

    # Load Babelscape NER model
    ner_tokenizer = AutoTokenizer.from_pretrained("Babelscape/wikineural-multilingual-ner")
    ner_model = AutoModelForTokenClassification.from_pretrained("Babelscape/wikineural-multilingual-ner")
    ner_pipeline = pipeline('ner', model=ner_model, tokenizer=ner_tokenizer, aggregation_strategy="simple")

    # Load GLinER model
    # NOTE(review): confirm this checkpoint id resolves on the Hub and is a
    # seq2seq model; extract_info_with_gliner relies on `generate`.
    seq2seq_tokenizer = AutoTokenizer.from_pretrained("DAMO-NLP-SG/gliner-large")
    seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained("DAMO-NLP-SG/gliner-large")

    return spacy_model, ner_tokenizer, ner_model, ner_pipeline, seq2seq_tokenizer, seq2seq_model


(nlp_spacy, tokenizer_ner, model_ner, nlp_ner,
 gliner_tokenizer, gliner_model) = _load_models()


def extract_info_with_gliner(text, info_type, *, model=None, tokenizer=None):
    """Prompt the seq2seq model to extract ``info_type`` from ``text``.

    Args:
        text: Source text to search.
        info_type: Human-readable description of what to extract,
            e.g. ``"company names"``.
        model: Optional seq2seq model; defaults to the module-level
            ``gliner_model``.
        tokenizer: Optional tokenizer; defaults to the module-level
            ``gliner_tokenizer``.

    Returns:
        The decoded model output as a string.
    """
    model = gliner_model if model is None else model
    tokenizer = gliner_tokenizer if tokenizer is None else tokenizer

    input_text = f"Extract {info_type} from: {text}"
    # Resumes can be long; truncate to the tokenizer's limit.  The instruction
    # sits at the start of the prompt, so it survives truncation.
    input_ids = tokenizer(input_text, return_tensors="pt", truncation=True).input_ids
    outputs = model.generate(input_ids, max_length=100)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


def _split_items(raw):
    """Split a comma-separated model answer into non-empty, stripped items."""
    return [item.strip() for item in (raw or "").split(",") if item.strip()]


class EnhancedNERPipeline:
    """Combine spaCy NER, a Hugging Face NER pipeline and GLinER prompts.

    Calling an instance with a text returns a spaCy ``Doc`` whose ``ents``
    hold the merged spaCy + Hugging Face entities and whose
    ``_.gliner_companies``, ``_.gliner_experience`` and ``_.gliner_education``
    attributes hold the GLinER answers.
    """

    def __init__(self, nlp_spacy, nlp_ner, gliner_model, gliner_tokenizer):
        self.nlp_spacy = nlp_spacy
        self.nlp_ner = nlp_ner
        self.gliner_model = gliner_model
        self.gliner_tokenizer = gliner_tokenizer

    def _ask_gliner(self, text, info_type):
        """Query GLinER using the model/tokenizer this instance was given."""
        return extract_info_with_gliner(
            text, info_type, model=self.gliner_model, tokenizer=self.gliner_tokenizer
        )

    def __call__(self, text):
        # SpaCy processing
        doc = self.nlp_spacy(text)

        # Babelscape NER processing.  The pipeline yields plain dicts, so each
        # one is mapped onto a spaCy Span before it can live in doc.ents.
        spans = list(doc.ents)
        for result in self.nlp_ner(text):
            start, end = result.get("start"), result.get("end")
            if start is None or end is None:
                continue  # no character offsets available for this result
            span = doc.char_span(
                start, end, label=result["entity_group"], alignment_mode="expand"
            )
            if span is not None:
                spans.append(span)

        # doc.ents must not contain overlapping spans.  ORG entities are kept
        # because extract_companies / extract_education read them from here.
        doc.ents = filter_spans(spans)

        # GLinER processing, stored as custom attributes
        doc._.gliner_companies = _split_items(self._ask_gliner(text, "company names"))
        doc._.gliner_experience = self._ask_gliner(text, "years of experience")
        doc._.gliner_education = _split_items(
            self._ask_gliner(text, "educational institutions")
        )

        return doc


# Create the enhanced pipeline
enhanced_nlp = EnhancedNERPipeline(nlp_spacy, nlp_ner, gliner_model, gliner_tokenizer)


def extract_companies(doc):
    """Return the sorted union of GLinER companies and ORG entities in ``doc``."""
    companies = {name for name in (doc._.gliner_companies or []) if name}
    companies.update(ent.text for ent in doc.ents if ent.label_ == "ORG")
    return sorted(companies)


def extract_experience(doc):
    """Estimate years of experience for the resume in ``doc``.

    Takes the larger of (a) the first integer in the GLinER answer and
    (b) the greatest distance in years between now and any parseable,
    non-future DATE entity.  Returns 0 when neither source yields a value.
    """
    # The GLinER answer may be empty or contain no digits at all.
    match = re.search(r'\d+', doc._.gliner_experience or "")
    gliner_years = int(match.group()) if match else 0

    current_year = datetime.now().year
    date_years = [
        current_year - parsed.year
        for ent in doc.ents
        if ent.label_ == "DATE"
        and (parsed := dateparser.parse(ent.text))
        and parsed.year <= current_year
    ]
    return max(gliner_years, max(date_years, default=0))


def extract_education(doc):
    """Return the sorted union of GLinER institutions and school-like ORG entities."""
    institutions = {name for name in (doc._.gliner_education or []) if name}
    institutions.update(
        ent.text
        for ent in doc.ents
        if ent.label_ == "ORG"
        and any(keyword in ent.text.lower() for keyword in _EDUCATION_KEYWORDS)
    )
    return sorted(institutions)


def extract_text_from_pdf(file):
    """Return the concatenated text of every page of the PDF in ``file``.

    ``file`` is a binary file-like object; its content is read fully.
    """
    # The context manager closes the PyMuPDF document once the text is read.
    with fitz.open(stream=file.read(), filetype="pdf") as pdf:
        return "".join(page.get_text() for page in pdf)


def extract_text_from_doc(file):
    """Return the paragraphs of the DOCX in ``file`` joined by single spaces."""
    doc = Document(file)
    return " ".join(paragraph.text for paragraph in doc.paragraphs)


def authenticate_gemini(api_key):
    """Configure the Gemini client and return a ``gemini-pro`` model handle.

    Shows a Streamlit error and returns ``None`` if configuration raises.
    """
    try:
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel('gemini-pro')
        return model
    except Exception as e:
        st.error(f"Authentication failed: {e}")
        return None


def generate_summary(text, model):
    """Ask ``model`` for a short summary of the resume ``text`` and return it."""
    prompt = f"Summarize the following resume:\n\n{text}\n\nProvide a brief overview of the candidate's qualifications, experience, and key skills."
    response = model.generate_content(prompt)
    return response.text


def main():
    """Render the Streamlit UI and run the analysis on the uploaded resume."""
    st.title("Enhanced Resume Analyzer with GLinER Focus")

    api_key = st.text_input("Enter your Google Gemini API key", type="password")
    uploaded_file = st.file_uploader("Choose a PDF or DOCX file", type=["pdf", "docx"])

    if uploaded_file is None or not api_key:
        return

    # Top-level UI boundary: report any failure to the user instead of
    # letting the app crash.
    try:
        model = authenticate_gemini(api_key)
        if model is None:
            return

        if uploaded_file.type == "application/pdf":
            resume_text = extract_text_from_pdf(uploaded_file)
        elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
            resume_text = extract_text_from_doc(uploaded_file)
        else:
            st.error("Unsupported file format.")
            return

        # Without text (e.g. an image-only PDF) the models have nothing to
        # work on.
        if not resume_text.strip():
            st.error("No text could be extracted from the uploaded file.")
            return

        # Process the resume text with the enhanced pipeline
        doc = enhanced_nlp(resume_text)

        companies = extract_companies(doc)
        experience = extract_experience(doc)
        education = extract_education(doc)

        # Use GLinER for other extractions
        phone = extract_info_with_gliner(resume_text, "phone number")
        email = extract_info_with_gliner(resume_text, "email address")
        linkedin = extract_info_with_gliner(resume_text, "LinkedIn profile")

        st.subheader("Extracted Information")
        st.write(f"**Years of Experience:** {experience}")
        st.write("**Companies:**", ", ".join(companies))
        st.write("**Education:**", ", ".join(education))
        st.write(f"**Phone Number:** {phone}")
        st.write(f"**Email:** {email}")
        st.write(f"**LinkedIn:** {linkedin}")

        summary = generate_summary(resume_text, model)
        st.subheader("Resume Summary")
        st.write(summary)
    except Exception as e:
        st.error(f"Error during processing: {e}")


if __name__ == "__main__":
    main()