Spaces:
Sleeping
Sleeping
Create nlp.py
Browse files
nlp.py
ADDED
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests
|
2 |
+
import json
|
3 |
+
import re
|
4 |
+
import numpy as np
|
5 |
+
from sentence_transformers import SentenceTransformer, util
|
6 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
7 |
+
|
8 |
+
import spacy
|
9 |
+
import nltk
|
10 |
+
nltk.download('punkt')
|
11 |
+
from nltk.tokenize import sent_tokenize
|
12 |
+
|
13 |
+
# Langchain packages
|
14 |
+
from langchain.text_splitter import CharacterTextSplitter #text splitter
|
15 |
+
from langchain.embeddings import HuggingFaceEmbeddings #for using HuggingFace models
|
16 |
+
from langchain.vectorstores import FAISS #facebook vectorization
|
17 |
+
from langchain import HuggingFaceHub
|
18 |
+
from langchain.chains.question_answering import load_qa_chain
|
19 |
+
|
20 |
+
from constants import StreamlitException
|
21 |
+
from constants import API_URL_summary, API_URL_name, HEADERS, TECH_SKILLS
|
22 |
+
from constants import SENTENCE_TRANSFORMER_MODEL, LLM_REPO_ID
|
23 |
+
|
24 |
+
# Function to summarize resume text
|
25 |
+
def summarize_text(text, max_length=100):
    """Summarize resume text through the hosted summarization endpoint.

    Args:
        text: Text to summarize. An empty string short-circuits without
            making a network call.
        max_length: Value forwarded to the endpoint as the ``max_length``
            entry of the request ``parameters``.

    Returns:
        The ``summary_text`` string from the first element of the JSON
        response; the literal string ``'nan'`` when ``text`` is empty; or a
        ``StreamlitException`` instance (returned, not raised) when the
        endpoint answers with a non-200 status or an unusable payload.
    """
    if text == '':
        return 'nan'

    payload = json.dumps(
        {
            "inputs": text,
            "parameters": {"max_length": max_length},
        }
    )
    response = requests.post(API_URL_summary, headers=HEADERS, data=payload)
    if response.status_code != 200:
        return StreamlitException(f"**Error**: {response.status_code}")

    try:
        return response.json()[0]["summary_text"]
    except (KeyError, IndexError, TypeError, ValueError):
        # ValueError: the body is not valid JSON. TypeError: the JSON has an
        # unexpected shape (e.g. a list of non-dict items). Both are
        # reported like a missing key instead of escaping to the caller.
        return StreamlitException("**Error**: Invalid response from API.")
|
43 |
+
|
44 |
+
# Function to extract candidate name(s) from resume text
|
45 |
+
def extract_person_names_and_email(text):
    """Extract person names and e-mail addresses from resume text.

    E-mail addresses are found locally with a regular expression. Person
    names come from the endpoint at ``API_URL_name``: every entity in the
    first result whose ``entity_group`` is ``"PER"`` contributes its
    ``word``.

    Args:
        text: Raw resume text.

    Returns:
        A ``(person_names, emails)`` tuple of two sets of strings.
    """
    # The TLD class is deliberately [A-Za-z]: writing [A-Z|a-z] would also
    # accept a literal '|' as part of the address.
    emails = re.findall(
        r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text
    )

    payload = json.dumps({"inputs": [text]})
    response = requests.post(API_URL_name, headers=HEADERS, data=payload)
    output = json.loads(response.content.decode("utf-8"))

    # The loop name is distinct from the ``text`` argument so the argument
    # is never rebound to an entity dict. The resume text and the raw API
    # output are intentionally not printed: they contain personal data.
    person_names = {
        entity["word"]
        for entity in output[0]
        if entity["entity_group"] == "PER"
    }
    return person_names, set(emails)
|
59 |
+
|
60 |
+
# Function to extract key technical skills from resume text
|
61 |
+
def extract_tech_skills(doc):
    """Collect the technical-skill tokens found in a document.

    Args:
        doc: Iterable of tokens exposing a ``text`` attribute
            (presumably a spaCy ``Doc`` -- confirm against the caller).

    Returns:
        A set with the upper-cased text of every token whose lower-cased
        text is a member of ``TECH_SKILLS``.
    """
    matched = set()
    for token in doc:
        word = token.text
        if word.lower() in TECH_SKILLS:
            matched.add(word.upper())
    return matched
|
64 |
+
|
65 |
+
# Function to calculate overall percentage match between job description and resume
|
66 |
+
def calculate_similarity(job_description, resume):
    """Return the percentage similarity between a job description and a resume.

    Both texts are embedded with the sentence-transformer model named by
    ``SENTENCE_TRANSFORMER_MODEL`` and compared with ``util.cos_sim``.

    Args:
        job_description: Job description text. An empty string
            short-circuits before any model is loaded.
        resume: Resume text to compare against the job description.

    Returns:
        Element ``[0][0]`` of the ``util.cos_sim`` result multiplied by
        100, or ``np.nan`` when ``job_description`` is empty.
    """
    if job_description == '':
        # ``np.nan`` is the canonical spelling; the ``np.NaN`` alias was
        # removed in NumPy 2.0 and raises AttributeError there.
        return np.nan

    model = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL)
    job_description_embeddings = model.encode(job_description)
    resume_embeddings = model.encode(resume)
    similarity_score = util.cos_sim(job_description_embeddings, resume_embeddings)
    return similarity_score[0][0] * 100
|
75 |
+
|
76 |
+
# Define a function to clean sentences
|
77 |
+
def clean_text(text):
    """Normalize resume text into punctuated, sentence-like prose.

    Steps, in order: strip bullet glyphs and leading list markers,
    collapse blank lines, turn pipe separators and unpunctuated line
    breaks into sentence ends, upper-case the first letter after each
    sentence end, and finally turn free-standing hyphens into sentence
    ends.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text.
    """
    # Remove bullet glyphs anywhere in the text.
    text = re.sub(r'[\u2022\u2023\u25E6\u2043]', '', text).strip()
    # Remove leading whitespace and list markers (when followed by
    # whitespace) at the start of each line.
    text = re.sub(r'^\s*[-*•⁃◦▸▹]*\s+', '', text, flags=re.MULTILINE)
    # Collapse runs of newlines; strip() also takes care of any
    # leading/trailing newlines and spaces.
    text = re.sub(r'\n+', '\n', text).strip()
    # Replace pipe separators with a sentence end.
    text = re.sub(r'\s*\|\s*', '. ', text).strip()
    # End every line that lacks terminal punctuation with a full stop.
    text = re.sub(r'([^.!?])\s*\n', r'\1. ', text)
    # Upper-case only the first letter of each sentence. str.capitalize()
    # would also lower-case the rest of the word and mangle acronyms
    # ("SQL" -> "Sql").
    text = re.sub(
        r'(?<=[.!?]\s)(\w+)',
        lambda match: match.group()[0].upper() + match.group()[1:],
        text,
    )
    # Replace a hyphen (and any whitespace after it) with '. ' when it is
    # not preceded by a letter and not followed by a digit or by word
    # characters leading to another hyphen.
    # NOTE(review): the trailing (?<!\d) inspects the hyphen itself, so it
    # never rejects a match; confirm whether digit-preceded hyphens such as
    # "2019-present" were meant to be left intact.
    text = re.sub(r'(?<![^\W\d_])-(?!\d|\w*-)(?<!\d)\s*', '. ', text)
    return text
|
98 |
+
|
99 |
+
|
100 |
+
# Define a function to split sentences based on regular expressions
|
101 |
+
def split_text(string):
    """Split a string into sentences.

    Args:
        string: Text to split.

    Returns:
        Whatever ``sent_tokenize`` (imported from ``nltk.tokenize``)
        returns for ``string``.
    """
    return sent_tokenize(string)
|
105 |
+
|
106 |
+
# Function to calculate overall percentage match
|
107 |
+
def get_average_similarity_scores(job_description, resumes):
    """Average the similarity of each job-description entry over all resume entries.

    Args:
        job_description: Input encoded for the rows of the similarity
            matrix (presumably a list of phrases rather than one string,
            since the matrix is averaged per row -- confirm against the
            caller).
        resumes: Input encoded for the columns of the similarity matrix.

    Returns:
        A list holding the mean of each row of the cosine similarity
        matrix.
    """
    encoder = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL)
    jd_vectors = encoder.encode(job_description)
    resume_vectors = encoder.encode(resumes)
    # Rows follow the job_description entries, columns the resumes entries.
    pairwise = cosine_similarity(jd_vectors, resume_vectors)
    row_means = np.mean(pairwise, axis=1)
    return row_means.tolist()
|
117 |
+
|
118 |
+
# Function to respond to user Q&A
|
119 |
+
def qna_query(loader, query, chunk_size=500, chunk_overlap=10, temperature=1, max_length=100):
    """Answer a question about a loaded document with a "stuff" QA chain.

    Args:
        loader: Object exposing ``load_and_split()``; its result is fed to
            the text splitter.
        query: Question to answer; also used for the similarity search.
        chunk_size: ``chunk_size`` passed to ``CharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to ``CharacterTextSplitter``.
        temperature: ``temperature`` entry of the LLM's ``model_kwargs``.
        max_length: ``max_length`` entry of the LLM's ``model_kwargs``.

    Returns:
        The result of ``chain.run`` on the documents that the FAISS
        similarity search returns for ``query``.
    """
    # Split the loaded pages into chunks and index them.
    pages = loader.load_and_split()
    splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = splitter.split_documents(pages)
    index = FAISS.from_documents(chunks, HuggingFaceEmbeddings())

    # Build the QA chain on top of the hosted LLM.
    llm_kwargs = {"temperature": temperature, "max_length": max_length}
    llm = HuggingFaceHub(repo_id=LLM_REPO_ID, model_kwargs=llm_kwargs)
    qa_chain = load_qa_chain(llm, chain_type="stuff")

    # Retrieve the chunks closest to the query and let the chain answer.
    relevant = index.similarity_search(query)
    return qa_chain.run(input_documents=relevant, question=query)
|
132 |
+
|
133 |
+
|
134 |
+
# Load the English language model for spaCy
|
135 |
+
# Module-level pipeline object: created once, when this module is imported.
lang_model = spacy.load("en_core_web_sm")
|