import requests
import json
import re
import numpy as np
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
import spacy
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
# LangChain packages
from langchain.text_splitter import CharacterTextSplitter  # text splitter
from langchain.embeddings import HuggingFaceEmbeddings  # for using Hugging Face models
from langchain.vectorstores import FAISS  # Facebook AI Similarity Search vector store
from langchain import HuggingFaceHub
from langchain.chains.question_answering import load_qa_chain
from constants import StreamlitException
from constants import API_URL_summary, API_URL_name, HEADERS, TECH_SKILLS
from constants import SENTENCE_TRANSFORMER_MODEL, LLM_REPO_ID
# Function to summarize resume text
def summarize_text(text, max_length=100):
    if text != '':
        data = json.dumps(
            {
                "inputs": text,
                "parameters": {"max_length": max_length}
            }
        )
        response = requests.post(API_URL_summary, headers=HEADERS, data=data)
        if response.status_code != 200:
            return StreamlitException(f"**Error**: {response.status_code}")
        try:
            summary = response.json()[0]["summary_text"]
        except (KeyError, IndexError):
            return StreamlitException("**Error**: Invalid response from API.")
        return summary
    else:
        return 'nan'
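# A minimal usage sketch for summarize_text (assumes API_URL_summary and
# HEADERS in constants.py point at a working summarization endpoint with a
# valid Hugging Face API token; the sample resume text is invented):
if __name__ == "__main__":
    sample_resume = (
        "Senior data engineer with 8 years of experience building Python and "
        "Spark pipelines, leading a team of five, and deploying ML models to AWS."
    )
    print(summarize_text(sample_resume, max_length=60))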
# Function to extract candidate name(s) and email address(es) from resume text
def extract_person_names_and_email(text):
    # Extract email addresses
    emails = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text)
    # Extract person names via the named-entity recognition API
    data = json.dumps({"inputs": [text]})
    response = requests.post(API_URL_name, headers=HEADERS, data=data)
    output = json.loads(response.content.decode("utf-8"))
    person_names = set()
    for entity in output[0]:
        if entity["entity_group"] == "PER":
            person_names.add(entity["word"])
    return person_names, set(emails)
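# A quick sketch of extract_person_names_and_email (assumes API_URL_name points
# at a token-classification endpoint that returns entity groups such as "PER";
# the sample text is invented for illustration):
if __name__ == "__main__":
    names, emails = extract_person_names_and_email(
        "Jane Doe | jane.doe@example.com | Data Scientist"
    )
    print(names, emails)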
# Function to extract key technical skills from resume text
def extract_tech_skills(doc):
    keywords = [token.text.upper() for token in doc if token.text.lower() in TECH_SKILLS]
    return set(keywords)
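# Example of extract_tech_skills (loads spaCy locally here because lang_model
# is only defined at the bottom of this module; assumes TECH_SKILLS contains
# lowercase entries such as "python" and "sql"):
if __name__ == "__main__":
    nlp = spacy.load("en_core_web_sm")
    doc = nlp("Built ETL jobs in Python and SQL on AWS.")
    print(extract_tech_skills(doc))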
# Function to calculate overall percentage match between job description and resume
def calculate_similarity(job_description, resume):
    if job_description != '':
        model = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL)
        job_description_embeddings = model.encode(job_description)
        resume_embeddings = model.encode(resume)
        similarity_score = util.cos_sim(job_description_embeddings, resume_embeddings)
        # .item() converts the 0-dim tensor to a plain float
        return similarity_score[0][0].item() * 100
    else:
        return np.nan
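# A usage sketch for calculate_similarity (downloads the sentence-transformer
# model on first call; both strings are invented examples):
if __name__ == "__main__":
    score = calculate_similarity(
        "Looking for a Python developer with NLP experience.",
        "Python developer who has shipped several NLP products.",
    )
    print(f"Match: {score:.1f}%")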
# Define a function to clean sentences
def clean_text(text):
    # Remove bullet points
    text = re.sub(r'[\u2022\u2023\u25E6\u2043]', '', text).strip()
    # Remove more types of bullet points
    text = re.sub(r'^\s*[-*•⁃◦▸▹]*\s+', '', text, flags=re.MULTILINE)
    # Collapse repeated new lines
    text = re.sub(r'\n+', '\n', text).strip()
    # Remove any leading/trailing newlines and spaces
    text = text.strip('\n').strip()
    # Replace pipe separators with a full stop
    text = re.sub(r'\s*\|\s*', '. ', text).strip()
    # Add full stops to the end of each line that lacks terminal punctuation
    text = re.sub(r'([^.!?])\s*\n', r'\1. ', text)
    # Capitalize the first letter of each sentence
    text = re.sub(r'(?<=[.!?]\s)(\w+)', lambda x: x.group().capitalize(), text)
    # Replace ' - ' with '. ' only if it's not part of a hyphenated word
    text = re.sub(r'(?<![^\W\d_])-(?!\d|\w*-)(?<!\d)\s*', '. ', text)
    # Return cleaned text
    return text
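# clean_text in action on a typical resume fragment (pure string processing,
# no external services needed):
if __name__ == "__main__":
    raw = "• Led a team of 5 | Shipped 3 products\n• Improved latency"
    print(clean_text(raw))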
# Define a function to split cleaned text into sentences
def split_text(string):
    # Split the cleaned string into sentences using NLTK's Punkt tokenizer
    sentences = sent_tokenize(string)
    return sentences
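# clean_text and split_text are meant to be chained, as sketched here
# (the resume fragment is invented):
if __name__ == "__main__":
    cleaned = clean_text("Built data pipelines | Mentored juniors\nWrote docs")
    print(split_text(cleaned))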
# Function to calculate average phrase-level match between job description and resumes
def get_average_similarity_scores(job_description, resumes):
    # Calculate cosine similarity matrix between job description phrases and resume phrases
    model = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL)
    job_description_embeddings = model.encode(job_description)
    resume_embeddings = model.encode(resumes)
    similarity_matrix = cosine_similarity(job_description_embeddings, resume_embeddings)
    # Average similarity of each job-description phrase across all resume phrases
    avg_similarity_scores = np.mean(similarity_matrix, axis=1)
    # Return the average similarity scores as a list
    return avg_similarity_scores.tolist()
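# A sketch of get_average_similarity_scores with phrase lists (the function
# expects lists of strings so that model.encode returns 2-D arrays for
# cosine_similarity; all phrases are invented):
if __name__ == "__main__":
    jd_phrases = ["Python experience", "team leadership"]
    resume_phrases = ["5 years of Python", "led a team of four", "AWS deployment"]
    print(get_average_similarity_scores(jd_phrases, resume_phrases))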
# Function to respond to user Q&A over the uploaded document
def qna_query(loader, query, chunk_size=500, chunk_overlap=10, temperature=1, max_length=100):
    pages = loader.load_and_split()
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    docs = text_splitter.split_documents(pages)
    embeddings = HuggingFaceEmbeddings()
    db = FAISS.from_documents(docs, embeddings)
    llm = HuggingFaceHub(
        repo_id=LLM_REPO_ID,
        model_kwargs={"temperature": temperature, "max_length": max_length},
    )
    chain = load_qa_chain(llm, chain_type="stuff")
    docs = db.similarity_search(query)
    return chain.run(input_documents=docs, question=query)
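# A usage sketch for qna_query (PyPDFLoader and the "resume.pdf" path are
# assumptions for illustration; HuggingFaceHub also expects the
# HUGGINGFACEHUB_API_TOKEN environment variable to be set):
if __name__ == "__main__":
    from langchain.document_loaders import PyPDFLoader
    pdf_loader = PyPDFLoader("resume.pdf")
    print(qna_query(pdf_loader, "How many years of Python experience?"))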
# Load the English language model for spaCy
lang_model = spacy.load("en_core_web_sm")