fazni committed on
Commit
b06ff0c
·
1 Parent(s): 2a4d161

added app.py file with all other files

Browse files
Files changed (7) hide show
  1. FindKeyword.py +11 -0
  2. PreprocessText.py +28 -0
  3. app.py +243 -0
  4. htmlTemplates.py +44 -0
  5. model_Responce.py +38 -0
  6. models/model.h5 +3 -0
  7. requirements.txt +17 -0
FindKeyword.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
def FindKeyWords(keywords, text):
    """Highlight every occurrence of the given keywords in ``text``.

    Matching is case-insensitive and restricted to whole terms: a keyword
    only matches when it is not directly preceded or followed by a word
    character.

    Args:
        keywords: Iterable of keyword strings. Blank entries are ignored.
        text: The resume text to search.

    Returns:
        ``text`` with each match wrapped in a yellow ``<mark>`` tag, or the
        string ``"Keyword not found in the Resume."`` as soon as one of the
        keywords does not occur in ``text``. When no usable keyword is
        supplied, ``text`` is returned unchanged.
    """
    # Blank entries (empty input box, trailing comma) would compile to an
    # empty group that matches at every word boundary.
    terms = [keyword for keyword in keywords if keyword and keyword.strip()]
    if not terms:
        return text

    def whole_term(alternatives):
        # Lookarounds instead of \b so keywords that start or end with a
        # non-word character (e.g. "C++") can still match.
        return re.compile(r'(?<!\w)({0})(?!\w)'.format(alternatives), flags=re.IGNORECASE)

    # Check presence against the untouched text: once markup is inserted,
    # words such as "mark", "style" or "color" would match inside the tags.
    for term in terms:
        if not whole_term(re.escape(term)).search(text):
            return "Keyword not found in the Resume."

    # Highlight in a single pass, longest keyword first, so that overlapping
    # keywords ("machine" / "machine learning") never rewrite each other's
    # markup.
    ordered = sorted(dict.fromkeys(terms), key=len, reverse=True)
    combined = whole_term('|'.join(re.escape(term) for term in ordered))
    return combined.sub(r'<mark style="background-color: yellow;">\1</mark>', text)
PreprocessText.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
def preprocess_text(text):
    """Clean raw resume text with a fixed sequence of regex substitutions.

    The substitutions run in the listed order and each one operates on the
    output of the previous one, so the order must not be changed (for
    example, the credential pattern relies on newlines already being gone).

    Args:
        text: Raw text, e.g. as extracted from a PDF.

    Returns:
        The cleaned text.
    """
    pipeline = (
        (r'\n|\t', ''),                                   # drop newlines and tabs
        (r'\s[A-Z]\s', ' '),                              # lone capital letter between whitespace
        (r'\S+@\S+', ''),                                 # e-mail addresses
        (r'\d{2}[-/]\d{2}[-/]\d{4}', ''),                 # DD-MM-YYYY or DD/MM/YYYY dates
        (r'\+\d{2}\s?\d{2,3}\s?\d{3,4}\s?\d{4}', ''),     # phone numbers with a +NN prefix
        (r'Issued\s\w+\s\d{4}Credential ID \w+', ''),     # certificate "Issued ... Credential ID ..." lines
        (r'\s+', ' '),                                    # collapse whitespace runs
        (r'(?<=[a-z])(?=[A-Z])', ' '),                    # split a lowercase/uppercase junction
    )

    cleaned = text
    for pattern, replacement in pipeline:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned
app.py ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import streamlit as st
3
+ from PyPDF2 import PdfReader
4
+ from dotenv import load_dotenv
5
+ from FindKeyword import FindKeyWords
6
+ from PreprocessText import preprocess_text
7
+ from model_Responce import model_prediction
8
+ from streamlit_extras.add_vertical_space import add_vertical_space
9
+ from langchain.text_splitter import CharacterTextSplitter
10
+ from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
11
+ from langchain.vectorstores import FAISS
12
+ # from langchain.chat_models import ChatOpenAI
13
+ # from langchain.memory import ConversationBufferMemory
14
+ # from langchain.chains import ConversationalRetrievalChain
15
+ from htmlTemplates import css, bot_template, user_template
16
+ from InstructorEmbedding import INSTRUCTOR
17
+ import numpy as np
18
+ from sklearn.metrics.pairwise import cosine_similarity
19
+
20
def get_text_chunks(text):
    """Split ``text`` into chunks with a newline-based ``CharacterTextSplitter``.

    The splitter is configured with ``chunk_size=1000`` and
    ``chunk_overlap=200``, both measured with ``len``.

    Args:
        text: The full text to split.

    Returns:
        Whatever ``CharacterTextSplitter.split_text`` returns for ``text``.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(text)
29
+
30
@st.cache_resource
def _load_instruct_embeddings():
    """Create the instructor embedding model once and reuse it afterwards.

    Instantiating ``HuggingFaceInstructEmbeddings`` loads a model, so doing
    it for every question is wasteful; ``st.cache_resource`` keeps the
    instance alive across Streamlit reruns.
    """
    return HuggingFaceInstructEmbeddings()


def encode_question(question):
    """Encode a question into its embedding vector.

    Args:
        question: The question text.

    Returns:
        The result of ``HuggingFaceInstructEmbeddings.embed_query(question)``.
    """
    return _load_instruct_embeddings().embed_query(question)
35
+
36
+ # def handle_user_input(question):
37
+ # response = st.session_state.conversation({'question':question})
38
+ # st.session_state.chat_history = response('chat_history')
39
+
40
+ # for i,message in enumerate(st.session_state.chat_history):
41
+ # if i % 2 == 0:
42
+ # st.write(user_template.replace("{{MSG}}",message.content),unsafe_allow_html=True)
43
+ # else:
44
+ # st.write(bot_template.replace("{{MSG}}",message.content),unsafe_allow_html=True)
45
+
46
+ # def get_conversation_chain(vector_store):
47
+ # llm = ChatOpenAI()
48
+ # memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
49
+ # conversation_chain = ConversationalRetrievalChain.from_llm(
50
+ # llm=llm,
51
+ # retriever=vector_store.as_retriever(),
52
+ # memory = memory
53
+ # )
54
+ # return conversation_chain
55
+
56
def save_vector_store(text_chunks):
    """Embed ``text_chunks`` and persist them in the local FAISS index.

    The chunks are merged into the index stored in ``faiss_index_V2`` when it
    already exists; otherwise a new index is created there, so the very first
    run no longer fails on a missing index.

    Args:
        text_chunks: List of text chunks to embed.

    Returns:
        The return value of ``st.write`` for the status message.
    """
    import os  # local import: only this function needs it

    index_dir = "faiss_index_V2"

    if not text_chunks:
        # Nothing to embed; building an index from zero texts would fail.
        return st.write("No text found to add to the vector store")

    embeddings = HuggingFaceInstructEmbeddings()
    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)

    if os.path.exists(os.path.join(index_dir, "index.faiss")):
        # Extend the existing index with the newly embedded chunks.
        store = FAISS.load_local(index_dir, embeddings)
        store.merge_from(vectorstore)
    else:
        # First run: the fresh store becomes the persisted index.
        store = vectorstore
    store.save_local(index_dir)

    return st.write("vector Store is Saved")
67
+
68
def button_function(all_text):
    """Attach a model prediction to every resume entry.

    Each dict in ``all_text`` is updated in place: the value stored under
    ``'text'`` is passed to ``model_prediction`` and the result is stored
    under ``'prediction'``.

    Args:
        all_text: List of dicts that each contain a ``'text'`` key.

    Returns:
        The same ``all_text`` list, with ``'prediction'`` added to each dict.
    """
    for resume in all_text:
        resume['prediction'] = model_prediction(resume['text'])
    return all_text
78
+
79
def _extract_pdf_text(pdf):
    """Return the concatenated text of all pages of one uploaded PDF."""
    # "or ''" keeps the join safe should a page yield no text at all.
    return "".join(page.extract_text() or "" for page in PdfReader(pdf).pages)


def get_pdf_text(pdfs,preprocess=True):
    """Extract the text of the uploaded PDF files.

    Args:
        pdfs: Iterable of uploaded PDF file objects (each needs a ``name``
            attribute when ``preprocess`` is true). ``None`` is treated as
            an empty upload list.
        preprocess: When true, return one entry per file with the text run
            through ``preprocess_text``; when false, return the raw text of
            all files concatenated into a single string.

    Returns:
        A list of ``{"filename": ..., "text": ...}`` dicts if ``preprocess``
        is true, otherwise one string.
    """
    pdfs = pdfs or []

    if preprocess:
        return [
            {"filename": pdf.name, "text": preprocess_text(_extract_pdf_text(pdf))}
            for pdf in pdfs
        ]

    return "".join(_extract_pdf_text(pdf) for pdf in pdfs)
115
+
116
def filter_keywords(all_text, keywords):
    """Run ``FindKeyWords`` over every resume entry.

    Args:
        all_text: List of dicts with ``'filename'`` and ``'text'`` keys.
        keywords: Keywords forwarded to ``FindKeyWords``.

    Returns:
        A new list of ``{"filename": ..., "text": ...}`` dicts in which
        ``"text"`` holds the result of ``FindKeyWords`` for that resume.
    """
    return [
        {"filename": resume['filename'], "text": FindKeyWords(keywords, resume['text'])}
        for resume in all_text
    ]
124
+
125
+
126
+ # Main body
127
def main():
    """Render the Streamlit resume-filter application.

    The sidebar collects the uploaded PDFs and the selected functionality;
    the main area then shows predictions, keyword highlighting, resumes
    matching a predicted field, or a similarity search over the saved FAISS
    index.
    """
    import os  # local import: only needed for the index-existence check

    load_dotenv()
    st.header("Resume Filter using Keywords 💬")

    # Sidebar contents
    with st.sidebar:
        st.title('🤗💬 LLM Chat App')
        # upload a PDF file
        pdfs = st.file_uploader("Upload your Resumes", type='pdf',accept_multiple_files=True)

        # Choose functionality: Prediction or Filtering
        functionality = st.radio("Choose functionality:", ("Make Predictions", "Filter Keywords","Predict the Suitable canditate","Ask Questions"))
        if functionality == "Ask Questions":
            if st.button('Process'):
                if not pdfs:
                    # Without uploads there is no text to index.
                    st.warning("Upload at least one PDF before processing.")
                else:
                    with st.spinner("Processing"):
                        # get pdf text
                        raw_text = get_pdf_text(pdfs, preprocess=False)

                        # get the text chunk
                        text_chunks = get_text_chunks(raw_text)

                        # create vector store
                        save_vector_store(text_chunks)
        add_vertical_space(5)
        st.write('Made with ❤️ by Fazni Farook')

    if pdfs is not None:
        all_text = get_pdf_text(pdfs)

        if functionality == "Make Predictions":
            if st.button('Make Prediction'):
                with st.spinner("Progressing"):
                    all_text = button_function(all_text)

                for item in all_text:
                    filename = item["filename"]
                    pred = item["prediction"]
                    st.markdown(f"**Filename: {filename}**")
                    st.markdown(f"**Prediction: {pred}**")
                    st.markdown("---")

        elif functionality == "Filter Keywords":
            # getting the keywords; blank entries (empty box, trailing comma)
            # are dropped so they are never passed to the highlighter
            keyword_input = st.text_input("Keyword")
            keywords = [keyword.strip() for keyword in keyword_input.split(",") if keyword.strip()]

            if st.button('Filter Keywords'):
                if not keywords:
                    st.warning("Enter at least one keyword.")
                else:
                    with st.spinner("Progressing"):
                        filtered_text = filter_keywords(all_text, keywords)

                    for item in filtered_text:
                        filename = item["filename"]
                        text = item["text"]
                        st.markdown(f"**Filename: {filename}**")
                        st.markdown(text, unsafe_allow_html=True)
                        st.markdown("---")

        elif functionality == "Predict the Suitable canditate":
            # getting the keyword
            keyword = st.text_input("Keyword")

            if st.button('Filter Resumes'):
                with st.spinner("Progressing"):
                    all_text = button_function(all_text)

                # Surrounding whitespace in the input must not prevent a match.
                wanted = keyword.strip().lower()
                count = 0
                for item in all_text:
                    filename = item["filename"]
                    prediction = item["prediction"]
                    if wanted==prediction.lower():
                        count+=1
                        st.markdown(f"**Filename: {filename}**")
                        st.markdown(prediction, unsafe_allow_html=True)
                        st.markdown("---")

                if count==0:
                    st.markdown("No match found")

        elif functionality == "Ask Questions":
            # Loading a non-existent index raises, so ask the user to build
            # it first instead of crashing the page.
            if not os.path.exists(os.path.join("faiss_index_V2", "index.faiss")):
                st.info("No vector store found yet. Upload resumes and click 'Process' first.")
                return

            embeddings = HuggingFaceInstructEmbeddings()

            new_db = FAISS.load_local("faiss_index_V2", embeddings)

            st.write(css,unsafe_allow_html=True)

            question = st.text_input("Ask Question")

            if st.button('Ask Question'):
                with st.spinner("Processing"):
                    if question:
                        # Convert the question to a vector
                        question_vector = encode_question(question)

                        # Look up the stored chunks closest to the question
                        output = new_db.similarity_search_by_vector(question_vector)
                        if output:
                            page_content = output[0].page_content
                            st.write(page_content)
                        else:
                            # An empty result would make output[0] raise.
                            st.write("No matching content found")


if __name__=='__main__':
    main()
htmlTemplates.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Stylesheet for the chat-message layout used by bot_template and
# user_template. The <style> element is closed explicitly so the markup
# injected into the page is well-formed.
css = '''
<style>
.chat-message {
    padding: 1.5rem; border-radius: 0.5rem; margin-bottom: 1rem; display: flex
}
.chat-message.user {
    background-color: #2b313e
}
.chat-message.bot {
    background-color: #475063
}
.chat-message .avatar {
    width: 20%;
}
.chat-message .avatar img {
    max-width: 78px;
    max-height: 78px;
    border-radius: 50%;
    object-fit: cover;
}
.chat-message .message {
    width: 80%;
    padding: 0 1.5rem;
    color: #fff;
}
</style>
'''
27
+
28
# HTML snippet for one bot message. The literal "{{MSG}}" marks where the
# message text is meant to be substituted (e.g. via str.replace).
bot_template = '''
<div class="chat-message bot">
<div class="avatar">
<img src="https://i.ibb.co/cN0nmSj/Screenshot-2023-05-28-at-02-37-21.png" style="max-height: 78px; max-width: 78px; border-radius: 50%; object-fit: cover;">
</div>
<div class="message">{{MSG}}</div>
</div>
'''

# HTML snippet for one user message; same "{{MSG}}" placeholder as above.
user_template = '''
<div class="chat-message user">
<div class="avatar">
<img src="https://i.ibb.co/rdZC7LZ/Photo-logo-1.png">
</div>
<div class="message">{{MSG}}</div>
</div>
'''
model_Responce.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pickle
2
+ import joblib
3
+ import numpy as np
4
+ import tensorflow as tf
5
+ from keras.utils import pad_sequences
6
+ from keras.preprocessing.text import Tokenizer
7
+
8
# Load the model from the pickle file
# filename = 'F:/CVFilter/models/model_pk.pkl'
# with open(filename, 'rb') as file:
#     model = pickle.load(file)

# Load the saved model
# model = joblib.load('F:/CVFilter/models/model.joblib')

# Forward slashes are used for the Windows path: the backslash form contained
# the invalid escape sequences "\C" and "\m", and the token file path
# below already uses this style.
model = tf.keras.models.load_model('F:/CVFilter/models/model.h5')

tokenfile = 'F:/CVFilter/tokenized_words/tokenized_words.pkl'
# Load the tokenized words from the pickle file.
# NOTE: pickle.load can execute arbitrary code; only load a trusted file here.
with open(tokenfile, 'rb') as file:
    loaded_tokenized_words = pickle.load(file)

# Length every token sequence is padded/truncated to before prediction.
max_review_length = 200
tokenizer = Tokenizer(
    num_words=10000,  # max no. of unique words to keep
    # "\\]" spells the same backslash + "]" characters as before without
    # relying on an invalid escape sequence.
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~',
    lower=True,  # convert to lower case
)
tokenizer.fit_on_texts(loaded_tokenized_words)

# Class names, indexed by the position of the highest model output.
outcome_labels = ['Business Analyst', 'Cyber Security','Data Engineer','Data Science','DevOps','Machine Learning Engineer','Mobile App Developer','Network Engineer','Quality Assurance','Software Engineer']
31
+
32
def model_prediction(text, model=model, tokenizer=tokenizer, labels=outcome_labels):
    """Predict the job-field label for one resume text.

    Args:
        text: The resume text to classify.
        model: Object whose ``predict`` is called on the padded sequence;
            defaults to the module-level model.
        tokenizer: Tokenizer used to turn ``text`` into a token sequence;
            defaults to the module-level tokenizer.
        labels: Sequence of label names indexed by class position.

    Returns:
        The entry of ``labels`` at the argmax position of the model output.
    """
    sequences = tokenizer.texts_to_sequences([text])
    scores = model.predict(pad_sequences(sequences, maxlen=max_review_length))
    best_index = np.argmax(scores)
    return labels[best_index]
models/model.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc809fc62b4f84621e22ecf8fe9c2af763d9f4fd0f1383c92e1e0a9aaae59674
3
+ size 51959288
requirements.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ langchain==0.0.195
2
+ PyPDF2==3.0.1
3
+ python-dotenv==1.0.0
4
+ streamlit==1.18.1
5
+ faiss-cpu==1.7.4
6
+ streamlit-extras
7
+ altair<5
8
+ pdfminer.six==20221105
9
+ numpy
10
+ keras==2.12.0
11
+ tensorflow==2.12.0
12
+ joblib
13
+ openai
14
+ huggingface_hub
15
+ InstructorEmbedding
16
+ torch
17
+ sentence_transformers