cm0805 committed
Commit 54f4f78
Parent: 6cad64a

Create nlp.py

Files changed (1): nlp.py (+135, -0)
nlp.py ADDED

import requests
import json
import re
import numpy as np
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity

import spacy
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

# LangChain packages
from langchain.text_splitter import CharacterTextSplitter  # text splitter
from langchain.embeddings import HuggingFaceEmbeddings  # for using Hugging Face models
from langchain.vectorstores import FAISS  # Facebook AI Similarity Search vector store
from langchain import HuggingFaceHub
from langchain.chains.question_answering import load_qa_chain

from constants import StreamlitException
from constants import API_URL_summary, API_URL_name, HEADERS, TECH_SKILLS
from constants import SENTENCE_TRANSFORMER_MODEL, LLM_REPO_ID
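
# NOTE: constants.py is expected to supply the Hugging Face Inference API
# endpoints (API_URL_summary, API_URL_name), the request HEADERS, the
# TECH_SKILLS vocabulary, and the model identifiers
# (SENTENCE_TRANSFORMER_MODEL, LLM_REPO_ID).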

# Function to summarize resume text
def summarize_text(text, max_length=100):
    if text != '':
        data = json.dumps(
            {
                "inputs": text,
                "parameters": {"max_length": max_length}
            }
        )
        response = requests.post(API_URL_summary, headers=HEADERS, data=data)
        if response.status_code != 200:
            return StreamlitException(f"**Error**: {response.status_code}")
        try:
            summary = response.json()[0]["summary_text"]
        except (KeyError, IndexError):
            return StreamlitException("**Error**: Invalid response from API.")
        return summary
    else:
        return 'nan'
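
# The summarization endpoint is expected to return a JSON list shaped like
# [{"summary_text": "..."}]; anything else is reported as an error above.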

# Function to extract candidate name(s) and email address(es) from resume text
def extract_person_names_and_email(text):
    # Extract email addresses with a regular expression
    emails = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text)
    # Extract person names via the hosted NER model
    data = json.dumps({"inputs": [text]})
    response = requests.post(API_URL_name, headers=HEADERS, data=data)
    output = json.loads(response.content.decode("utf-8"))
    person_names = set()
    for entity in output[0]:
        if entity["entity_group"] == "PER":
            person_names.add(entity["word"])
    return person_names, set(emails)
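
# The NER endpoint is expected to return grouped entities, e.g.
# [[{"entity_group": "PER", "word": "Jane Doe", "score": 0.99}, ...]]
# (shape assumed from the parsing above; actual fields depend on the model).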

# Function to extract key technical skills from resume text
def extract_tech_skills(doc):
    keywords = [token.text.upper() for token in doc if token.text.lower() in TECH_SKILLS]
    return set(keywords)
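
# Example (assuming "python" and "sql" appear in TECH_SKILLS):
#   extract_tech_skills(lang_model("Knows Python and SQL")) -> {'PYTHON', 'SQL'}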

# Function to calculate overall percentage match between job description and resume
def calculate_similarity(job_description, resume):
    if job_description != '':
        model = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL)
        job_description_embeddings = model.encode(job_description)
        resume_embeddings = model.encode(resume)
        similarity_score = util.cos_sim(job_description_embeddings, resume_embeddings)
        return similarity_score[0][0] * 100
    else:
        return np.nan
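
# util.cos_sim returns a 1x1 tensor here; [0][0] is the cosine similarity,
# scaled to a percentage (wrap in float() if a plain number is needed).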

# Define a function to clean sentences
def clean_text(text):
    # Remove bullet points
    text = re.sub(r'[\u2022\u2023\u25E6\u2043]', '', text).strip()
    # Remove more types of bullet points
    text = re.sub(r'^\s*[-*•⁃◦▸▹]*\s+', '', text, flags=re.MULTILINE)
    # Collapse runs of new lines
    text = re.sub(r'\n+', '\n', text).strip()
    # Remove any leading/trailing newlines and spaces
    text = text.strip('\n').strip()
    # Replace pipe symbol with a dot
    text = re.sub(r'\s*\|\s*', '. ', text).strip()
    # Add full stops to the end of each sentence
    text = re.sub(r'([^.!?])\s*\n', r'\1. ', text)
    # Capitalize the first letter of each sentence (without lowercasing the
    # rest of the word, so acronyms like "SQL" survive)
    text = re.sub(r'(?<=[.!?]\s)(\w+)', lambda m: m.group(1)[0].upper() + m.group(1)[1:], text)
    # Replace ' - ' with '. ' only if it's not part of a hyphenated word
    text = re.sub(r'(?<![^\W\d_])-(?!\d|\w*-)(?<!\d)\s*', '. ', text)
    # Return cleaned text
    return text
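
# Rough example: clean_text('• Python | SQL') -> 'Python. SQL'
# (exact output depends on the regex cascade above).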

# Define a function to split text into sentences
def split_text(string):
    # Split the cleaned string into sentences with NLTK's sent_tokenize
    sentences = sent_tokenize(string)
    return sentences

# Function to calculate overall percentage match
def get_average_similarity_scores(job_description, resumes):
    # Calculate cosine similarity matrix between job description and resumes
    model = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL)
    job_description_embeddings = model.encode(job_description)
    resume_embeddings = model.encode(resumes)
    similarity_matrix = cosine_similarity(job_description_embeddings, resume_embeddings)
    # Calculate the average similarity score for each phrase in the job
    # description across all phrases in the resumes
    avg_similarity_scores = np.mean(similarity_matrix, axis=1)
    # Return the average similarity scores as a list
    return avg_similarity_scores.tolist()

# Function to respond to user Q&A
def qna_query(loader, query, chunk_size=500, chunk_overlap=10, temperature=1, max_length=100):
    pages = loader.load_and_split()
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    docs = text_splitter.split_documents(pages)
    embeddings = HuggingFaceEmbeddings()
    db = FAISS.from_documents(docs, embeddings)
    llm = HuggingFaceHub(
        repo_id=LLM_REPO_ID, model_kwargs={
            "temperature": temperature, "max_length": max_length
        })
    chain = load_qa_chain(llm, chain_type="stuff")
    docs = db.similarity_search(query)
    return chain.run(input_documents=docs, question=query)
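
# `loader` is expected to be a LangChain document loader (e.g., PyPDFLoader)
# that exposes load_and_split().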

# Load the English language model for spaCy
lang_model = spacy.load("en_core_web_sm")
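
For reference, a minimal driver sketch (not part of this commit) showing how these helpers might be wired together. The resume text, job description, and "resume.pdf" path are hypothetical, and constants.py is assumed to be configured as noted above:

from langchain.document_loaders import PyPDFLoader

from nlp import (summarize_text, extract_person_names_and_email,
                 extract_tech_skills, calculate_similarity, qna_query, lang_model)

resume_text = "Jane Doe | jane.doe@example.com | Python, SQL, AWS"    # hypothetical
job_description = "Data engineer with strong Python and SQL skills"  # hypothetical

print(summarize_text(resume_text))
names, emails = extract_person_names_and_email(resume_text)
skills = extract_tech_skills(lang_model(resume_text))
match = calculate_similarity(job_description, resume_text)
print(names, emails, skills, f"{float(match):.1f}% match")

# Q&A over a PDF resume (path is hypothetical)
answer = qna_query(PyPDFLoader("resume.pdf"), "How many years of Python experience?")
print(answer)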