import requests
import json
import re
import numpy as np
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity

import spacy
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

# Langchain packages
from langchain.text_splitter import CharacterTextSplitter  # text splitter
from langchain.embeddings import HuggingFaceEmbeddings  # for using HuggingFace models
from langchain.vectorstores import FAISS  # Facebook AI Similarity Search vector store
from langchain import HuggingFaceHub
from langchain.chains.question_answering import load_qa_chain

from constants import StreamlitException
from constants import API_URL_summary, API_URL_name, HEADERS, TECH_SKILLS
from constants import SENTENCE_TRANSFORMER_MODEL, LLM_REPO_ID

from streamlit import cache_data


# Function to summarize resume text
@cache_data(show_spinner=False)
def summarize_text(text, max_length=100):
    if text != '':
        data = json.dumps(
            {
                "inputs": text,
                "parameters": {"max_length": max_length}
            }
        )
        response = requests.post(API_URL_summary, headers=HEADERS, data=data)
        if response.status_code != 200:
            return StreamlitException(f"**Error**: {response.status_code}")
        try:
            summary = response.json()[0]["generated_text"]
        except (KeyError, IndexError):
            return StreamlitException("**Error**: Invalid response from API.")
        return summary
    else:
        return 'nan'
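
# Illustrative call (assumes valid Hugging Face API credentials in constants.py):
#   summary = summarize_text("Experienced data engineer with ...", max_length=80)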
    
# Function to extract candidate name(s) from resume text
@cache_data(show_spinner=False)
def extract_person_names_and_email(text):
    # Extract email addresses with a simple regex
    emails = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text)
    # Extract person names via the hosted NER model
    data = json.dumps({"inputs": [text]})
    response = requests.post(API_URL_name, headers=HEADERS, data=data)
    output = json.loads(response.content.decode("utf-8"))
    person_names = set()
    for entity in output[0]:
        if entity["entity_group"] == "PER":
            person_names.add(entity["word"])
    return person_names, set(emails)

# Function to extract key technical skills from resume text
def extract_tech_skills(_doc):
    keywords = [token.text.upper() for token in _doc if token.text.lower() in TECH_SKILLS]
    return set(keywords)

# Function to calculate overall percentage match between job description and resume
@cache_data(show_spinner=False)
def calculate_similarity(job_description, resume):
    if job_description != '':
        model = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL)
        job_description_embeddings = model.encode(job_description)
        resume_embeddings = model.encode(resume)
        similarity_score = util.cos_sim(job_description_embeddings, resume_embeddings)
        # cos_sim returns a tensor; take the scalar and convert to a percentage
        return float(similarity_score[0][0]) * 100
    else:
        return np.nan
    
# Define a function to clean sentences
def clean_text(text):
    # Remove bullet points
    text = re.sub(r'[\u2022\u2023\u25E6\u2043]', '', text).strip()
    # Remove more types of bullet points
    text = re.sub(r'^\s*[-*•⁃◦▸▹]*\s+', '', text, flags=re.MULTILINE)
    # Remove extra new lines
    text = re.sub(r'\n+', '\n', text).strip()
    # Remove any leading/trailing newlines
    text = text.strip('\n')
    # Remove any leading/trailing spaces
    text = text.strip()
    # Replace pipe symbol with a dot
    text = re.sub(r'\s*\|\s*', '. ', text).strip()
    # Add full stops to the end of each sentence
    text = re.sub(r'([^.!?])\s*\n', r'\1. ', text)
    # Capitalize the first letter of each sentence
    text = re.sub(r'(?<=[.!?]\s)(\w+)', lambda x: x.group().capitalize(), text)
    # Replace ' - ' with '. ' only if it's not part of a hyphenated word
    text = re.sub(r'(?<![^\W\d_])-(?!\d|\w*-)(?<!\d)\s*', '. ', text)
    # Return cleaned text
    return text


# Define a function to split sentences based on regular expressions
def split_text(string):
    # Split the clean string into sentences
    sentences = sent_tokenize(string)
    return sentences

# Function to calculate overall percentage match
@cache_data(show_spinner=False)
def get_average_similarity_scores(job_description, resumes):
    # Calculate cosine similarity matrix between job description and resumes
    model = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL)
    job_description_embeddings = model.encode(job_description)
    resume_embeddings = model.encode(resumes)
    similarity_matrix = cosine_similarity(job_description_embeddings, resume_embeddings)
    # Calculate the average similarity score for each phrase in the job description across all phrases in the resumes
    avg_similarity_scores = np.mean(similarity_matrix, axis=1)
    # Return the average similarity scores as a list
    return avg_similarity_scores.tolist()
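
# Note: unlike calculate_similarity above, this function expects lists of sentences
# (e.g. the output of split_text) so that cosine_similarity receives 2-D embedding
# matrices. Illustrative call, with jd_text and resume_text standing for cleaned strings:
#   get_average_similarity_scores(split_text(jd_text), split_text(resume_text))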

# Function to respond to user Q&A
def qna_query(loader, query, chunk_size=500, chunk_overlap=10, temperature=1, max_length=100):
    # Load the document and split it into overlapping text chunks
    pages = loader.load_and_split()
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    docs = text_splitter.split_documents(pages)
    # Embed the chunks and index them in a FAISS vector store
    embeddings = HuggingFaceEmbeddings()
    db = FAISS.from_documents(docs, embeddings)
    # Build the LLM and a "stuff" question-answering chain
    llm = HuggingFaceHub(
        repo_id=LLM_REPO_ID, model_kwargs={
            "temperature": temperature, "max_length": max_length
        })
    chain = load_qa_chain(llm, chain_type="stuff")
    # Retrieve the most relevant chunks and answer the query
    docs = db.similarity_search(query)
    return chain.run(input_documents=docs, question=query)
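
# Illustrative call (the loader class and file name below are examples only):
#   from langchain.document_loaders import PyPDFLoader
#   answer = qna_query(PyPDFLoader("resume.pdf"), "What is the candidate's most recent role?")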


# Load the English language model for spaCy
lang_model = spacy.load("en_core_web_sm")
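

# Minimal end-to-end sketch (illustrative only): the resume text and job description
# below are hypothetical, and the API-backed helpers require valid Hugging Face
# credentials in constants.py.
if __name__ == "__main__":
    sample_resume = (
        "Jane Doe | jane.doe@example.com\n"
        "Data scientist with five years of experience in Python, SQL and AWS."
    )
    sample_job = "We are hiring a data scientist with strong Python and SQL skills."

    cleaned = clean_text(sample_resume)
    names, emails = extract_person_names_and_email(cleaned)
    skills = extract_tech_skills(lang_model(cleaned))
    overall_match = calculate_similarity(sample_job, cleaned)

    print("Summary:", summarize_text(cleaned))
    print("Names:", names, "Emails:", emails)
    print("Skills:", skills)
    print("Match (%):", overall_match)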