import requests
import json
import re
import numpy as np
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
import spacy
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
# LangChain packages
from langchain.text_splitter import CharacterTextSplitter  # text splitter
from langchain.embeddings import HuggingFaceEmbeddings  # for using Hugging Face models
from langchain.vectorstores import FAISS  # Facebook AI Similarity Search vector store
from langchain import HuggingFaceHub
from langchain.chains.question_answering import load_qa_chain
from constants import StreamlitException
from constants import API_URL_summary, API_URL_name, HEADERS, TECH_SKILLS
from constants import SENTENCE_TRANSFORMER_MODEL, LLM_REPO_ID
# Function to summarize resume text
def summarize_text(text, max_length=100):
    if text != '':
        data = json.dumps(
            {
                "inputs": text,
                "parameters": {"max_length": max_length}
            }
        )
        response = requests.post(API_URL_summary, headers=HEADERS, data=data)
        if response.status_code != 200:
            return StreamlitException(f"**Error**: {response.status_code}")
        try:
            summary = response.json()[0]["summary_text"]
        except (KeyError, IndexError):
            return StreamlitException("**Error**: Invalid response from API.")
        return summary
    else:
        return 'nan'
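# A minimal usage sketch for summarize_text (assumes API_URL_summary and
# HEADERS in constants.py point at a working summarization endpoint with a
# valid Hugging Face API token; the sample resume text is invented):
if __name__ == "__main__":
    sample_resume = (
        "Senior data engineer with 8 years of experience building Python and "
        "Spark pipelines, leading a team of five, and deploying ML models to AWS."
    )
    print(summarize_text(sample_resume, max_length=60))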
# Function to extract candidate name(s) and email address(es) from resume text
def extract_person_names_and_email(text):
    # Extract email addresses
    emails = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text)
    # Extract person names via the named-entity recognition API
    data = json.dumps({"inputs": [text]})
    response = requests.post(API_URL_name, headers=HEADERS, data=data)
    output = json.loads(response.content.decode("utf-8"))
    person_names = set()
    for entity in output[0]:
        if entity["entity_group"] == "PER":
            person_names.add(entity["word"])
    return person_names, set(emails)
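# A quick sketch of extract_person_names_and_email (assumes API_URL_name points
# at a token-classification endpoint that returns entity groups such as "PER";
# the sample text is invented for illustration):
if __name__ == "__main__":
    names, emails = extract_person_names_and_email(
        "Jane Doe | jane.doe@example.com | Data Scientist"
    )
    print(names, emails)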
# Function to extract key technical skills from resume text
def extract_tech_skills(doc):
    keywords = [token.text.upper() for token in doc if token.text.lower() in TECH_SKILLS]
    return set(keywords)
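# Example of extract_tech_skills (loads spaCy locally here because lang_model
# is only defined at the bottom of this module; assumes TECH_SKILLS contains
# lowercase entries such as "python" and "sql"):
if __name__ == "__main__":
    nlp = spacy.load("en_core_web_sm")
    doc = nlp("Built ETL jobs in Python and SQL on AWS.")
    print(extract_tech_skills(doc))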
# Function to calculate overall percentage match between job description and resume
def calculate_similarity(job_description, resume):
    if job_description != '':
        model = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL)
        job_description_embeddings = model.encode(job_description)
        resume_embeddings = model.encode(resume)
        similarity_score = util.cos_sim(job_description_embeddings, resume_embeddings)
        # .item() converts the 0-dim tensor to a plain float
        return similarity_score[0][0].item() * 100
    else:
        return np.nan
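# A usage sketch for calculate_similarity (downloads the sentence-transformer
# model on first call; both strings are invented examples):
if __name__ == "__main__":
    score = calculate_similarity(
        "Looking for a Python developer with NLP experience.",
        "Python developer who has shipped several NLP products.",
    )
    print(f"Match: {score:.1f}%")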
# Define a function to clean sentences
def clean_text(text):
    # Remove bullet points
    text = re.sub(r'[\u2022\u2023\u25E6\u2043]', '', text).strip()
    # Remove more types of bullet points
    text = re.sub(r'^\s*[-*•⁃◦▸▹]*\s+', '', text, flags=re.MULTILINE)
    # Collapse repeated new lines
    text = re.sub(r'\n+', '\n', text).strip()
    # Remove any leading/trailing newlines and spaces
    text = text.strip('\n').strip()
    # Replace pipe separators with a full stop
    text = re.sub(r'\s*\|\s*', '. ', text).strip()
    # Add full stops to the end of each line that lacks terminal punctuation
    text = re.sub(r'([^.!?])\s*\n', r'\1. ', text)
    # Capitalize the first letter of each sentence
    text = re.sub(r'(?<=[.!?]\s)(\w+)', lambda x: x.group().capitalize(), text)
    # Replace ' - ' with '. ' only if it's not part of a hyphenated word
    text = re.sub(r'(?<![^\W\d_])-(?!\d|\w*-)(?<!\d)\s*', '. ', text)
    # Return cleaned text
    return text
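# clean_text in action on a typical resume fragment (pure string processing,
# no external services needed):
if __name__ == "__main__":
    raw = "• Led a team of 5 | Shipped 3 products\n• Improved latency"
    print(clean_text(raw))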
# Define a function to split cleaned text into sentences
def split_text(string):
    # Split the cleaned string into sentences using NLTK's Punkt tokenizer
    sentences = sent_tokenize(string)
    return sentences
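# clean_text and split_text are meant to be chained, as sketched here
# (the resume fragment is invented):
if __name__ == "__main__":
    cleaned = clean_text("Built data pipelines | Mentored juniors\nWrote docs")
    print(split_text(cleaned))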
# Function to calculate average phrase-level match between job description and resumes
def get_average_similarity_scores(job_description, resumes):
    # Calculate cosine similarity matrix between job description phrases and resume phrases
    model = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL)
    job_description_embeddings = model.encode(job_description)
    resume_embeddings = model.encode(resumes)
    similarity_matrix = cosine_similarity(job_description_embeddings, resume_embeddings)
    # Average similarity of each job-description phrase across all resume phrases
    avg_similarity_scores = np.mean(similarity_matrix, axis=1)
    # Return the average similarity scores as a list
    return avg_similarity_scores.tolist()
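# A sketch of get_average_similarity_scores with phrase lists (the function
# expects lists of strings so that model.encode returns 2-D arrays for
# cosine_similarity; all phrases are invented):
if __name__ == "__main__":
    jd_phrases = ["Python experience", "team leadership"]
    resume_phrases = ["5 years of Python", "led a team of four", "AWS deployment"]
    print(get_average_similarity_scores(jd_phrases, resume_phrases))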
# Function to respond to user Q&A over the uploaded document
def qna_query(loader, query, chunk_size=500, chunk_overlap=10, temperature=1, max_length=100):
    pages = loader.load_and_split()
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    docs = text_splitter.split_documents(pages)
    embeddings = HuggingFaceEmbeddings()
    db = FAISS.from_documents(docs, embeddings)
    llm = HuggingFaceHub(
        repo_id=LLM_REPO_ID,
        model_kwargs={"temperature": temperature, "max_length": max_length},
    )
    chain = load_qa_chain(llm, chain_type="stuff")
    docs = db.similarity_search(query)
    return chain.run(input_documents=docs, question=query)
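# A usage sketch for qna_query (PyPDFLoader and the "resume.pdf" path are
# assumptions for illustration; HuggingFaceHub also expects the
# HUGGINGFACEHUB_API_TOKEN environment variable to be set):
if __name__ == "__main__":
    from langchain.document_loaders import PyPDFLoader
    pdf_loader = PyPDFLoader("resume.pdf")
    print(qna_query(pdf_loader, "How many years of Python experience?"))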
# Load the English language model for spaCy
lang_model = spacy.load("en_core_web_sm")