import requests
import json
import re
import numpy as np
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity

import spacy
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

# Langchain packages
from langchain.text_splitter import CharacterTextSplitter  # text splitter
from langchain.embeddings import HuggingFaceEmbeddings  # for using HuggingFace models
from langchain.vectorstores import FAISS  # Facebook AI Similarity Search vector store
from langchain import HuggingFaceHub
from langchain.chains.question_answering import load_qa_chain

from constants import StreamlitException
from constants import API_URL_summary, API_URL_name, HEADERS, TECH_SKILLS
from constants import SENTENCE_TRANSFORMER_MODEL, LLM_REPO_ID

from streamlit import cache_data


# Function to summarize resume text
@cache_data(show_spinner=False)
def summarize_text(text, max_length=100):
    if text != '':
        data = json.dumps(
            {
                "inputs": text,
                "parameters": {"max_length": max_length}
            }
        )
        response = requests.post(API_URL_summary, headers=HEADERS, data=data)
        if response.status_code != 200:
            return StreamlitException(f"**Error**: {response.status_code}")
        try:
            summary = response.json()[0]["generated_text"]
        except (KeyError, IndexError):
            return StreamlitException("**Error**: Invalid response from API.")
        return summary
    else:
        return 'nan'
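
# Illustrative call (assumes valid Hugging Face API credentials in constants.py):
#   summary = summarize_text("Experienced data engineer with ...", max_length=80)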
    
# Function to extract candidate name(s) from resume text
@cache_data(show_spinner=False)
def extract_person_names_and_email(text):
    # Extract email addresses with a simple regex
    emails = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text)
    # Extract person names via the hosted NER model
    data = json.dumps({"inputs": [text]})
    response = requests.post(API_URL_name, headers=HEADERS, data=data)
    output = json.loads(response.content.decode("utf-8"))
    person_names = set()
    for entity in output[0]:
        if entity["entity_group"] == "PER":
            person_names.add(entity["word"])
    return person_names, set(emails)

# Function to extract key technical skills from resume text
def extract_tech_skills(_doc):
    keywords = [token.text.upper() for token in _doc if token.text.lower() in TECH_SKILLS]
    return set(keywords)

# Function to calculate overall percentage match between job description and resume
@cache_data(show_spinner=False)
def calculate_similarity(job_description, resume):
    if job_description != '':
        model = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL)
        job_description_embeddings = model.encode(job_description)
        resume_embeddings = model.encode(resume)
        similarity_score = util.cos_sim(job_description_embeddings, resume_embeddings)
        # cos_sim returns a tensor; take the scalar and convert to a percentage
        return float(similarity_score[0][0]) * 100
    else:
        return np.nan
    
# Define a function to clean sentences
def clean_text(text):
    # Remove bullet points
    text = re.sub(r'[\u2022\u2023\u25E6\u2043]', '', text).strip()
    # Remove more types of bullet points
    text = re.sub(r'^\s*[-*•⁃◦▸▹]*\s+', '', text, flags=re.MULTILINE)
    # Remove extra new lines
    text = re.sub(r'\n+', '\n', text).strip()
    # Remove any leading/trailing newlines
    text = text.strip('\n')
    # Remove any leading/trailing spaces
    text = text.strip()
    # Replace pipe symbol with a dot
    text = re.sub(r'\s*\|\s*', '. ', text).strip()
    # Add full stops to the end of each sentence
    text = re.sub(r'([^.!?])\s*\n', r'\1. ', text)
    # Capitalize the first letter of each sentence
    text = re.sub(r'(?<=[.!?]\s)(\w+)', lambda x: x.group().capitalize(), text)
    # Replace ' - ' with '. ' only if it's not part of a hyphenated word
    text = re.sub(r'(?<![^\W\d_])-(?!\d|\w*-)(?<!\d)\s*', '. ', text)
    # Return cleaned text
    return text


# Define a function to split sentences based on regular expressions
def split_text(string):
    # Split the clean string into sentences
    sentences = sent_tokenize(string)
    return sentences

# Function to calculate overall percentage match
@cache_data(show_spinner=False)
def get_average_similarity_scores(job_description, resumes):
    # Calculate cosine similarity matrix between job description and resumes
    model = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL)
    job_description_embeddings = model.encode(job_description)
    resume_embeddings = model.encode(resumes)
    similarity_matrix = cosine_similarity(job_description_embeddings, resume_embeddings)
    # Calculate the average similarity score for each phrase in the job description across all phrases in the resumes
    avg_similarity_scores = np.mean(similarity_matrix, axis=1)
    # Return the average similarity scores as a list
    return avg_similarity_scores.tolist()
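
# Note: unlike calculate_similarity above, this function expects lists of sentences
# (e.g. the output of split_text) so that cosine_similarity receives 2-D embedding
# matrices. Illustrative call, with jd_text and resume_text standing for cleaned strings:
#   get_average_similarity_scores(split_text(jd_text), split_text(resume_text))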

# Function to respond to user Q&A
def qna_query(loader, query, chunk_size=500, chunk_overlap=10, temperature=1, max_length=100):
    # Load the document and split it into overlapping text chunks
    pages = loader.load_and_split()
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    docs = text_splitter.split_documents(pages)
    # Embed the chunks and index them in a FAISS vector store
    embeddings = HuggingFaceEmbeddings()
    db = FAISS.from_documents(docs, embeddings)
    # Build the LLM and a "stuff" question-answering chain
    llm = HuggingFaceHub(
        repo_id=LLM_REPO_ID, model_kwargs={
            "temperature": temperature, "max_length": max_length
        })
    chain = load_qa_chain(llm, chain_type="stuff")
    # Retrieve the most relevant chunks and answer the query
    docs = db.similarity_search(query)
    return chain.run(input_documents=docs, question=query)
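
# Illustrative call (the loader class and file name below are examples only):
#   from langchain.document_loaders import PyPDFLoader
#   answer = qna_query(PyPDFLoader("resume.pdf"), "What is the candidate's most recent role?")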


# Load the English language model for spaCy
lang_model = spacy.load("en_core_web_sm")
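

# Minimal end-to-end sketch (illustrative only): the resume text and job description
# below are hypothetical, and the API-backed helpers require valid Hugging Face
# credentials in constants.py.
if __name__ == "__main__":
    sample_resume = (
        "Jane Doe | jane.doe@example.com\n"
        "Data scientist with five years of experience in Python, SQL and AWS."
    )
    sample_job = "We are hiring a data scientist with strong Python and SQL skills."

    cleaned = clean_text(sample_resume)
    names, emails = extract_person_names_and_email(cleaned)
    skills = extract_tech_skills(lang_model(cleaned))
    overall_match = calculate_similarity(sample_job, cleaned)

    print("Summary:", summarize_text(cleaned))
    print("Names:", names, "Emails:", emails)
    print("Skills:", skills)
    print("Match (%):", overall_match)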