Spaces:

fazni
/

Resume-filter-plus-QA-documents

Running

App Files Files Community

fazni committed on Jun 19, 2023

Commit

b06ff0c

1 Parent(s): 2a4d161

added app.py file with all other files

Browse files

Files changed (7) hide show

FindKeyword.py +11 -0
PreprocessText.py +28 -0
app.py +243 -0
htmlTemplates.py +44 -0
model_Responce.py +38 -0
models/model.h5 +3 -0
requirements.txt +17 -0

FindKeyword.py ADDED Viewed

	@@ -0,0 +1,11 @@

+import re
def FindKeyWords(keywords, text):
    """Highlight every keyword occurrence in ``text`` with an HTML ``<mark>`` tag.

    Matching is case-insensitive and restricted to whole terms: a keyword only
    matches when it is not directly preceded or followed by a word character.
    Lookarounds are used instead of ``\\b`` so that terms ending in
    punctuation (for example ``C++`` or ``C#``) can match as well.

    Args:
        keywords: Iterable of keyword strings. Blank / whitespace-only entries
            are ignored; surrounding whitespace is stripped.
        text: The text to search and highlight.

    Returns:
        ``text`` with each match wrapped in
        ``<mark style="background-color: yellow;">...</mark>``.
        If at least one keyword does not occur in ``text`` the string
        ``"Keyword not found in the Resume."`` is returned instead.
        If no usable keyword is supplied, ``text`` is returned unchanged.

    Note:
        ``text`` itself is not HTML-escaped here; callers rendering the result
        as raw HTML should be aware of that.
    """
    terms = [kw.strip() for kw in keywords if kw and kw.strip()]
    if not terms:
        return text

    # Every keyword must be present. The check runs against the untouched
    # input so that markup inserted for one keyword can never satisfy (or be
    # corrupted by) another keyword such as "mark" or "style".
    for term in terms:
        present = re.search(
            r'(?<!\w)' + re.escape(term) + r'(?!\w)', text, flags=re.IGNORECASE
        )
        if not present:
            return "Keyword not found in the Resume."

    # Highlight all keywords in a single pass. Longer terms come first so that
    # "machine learning" wins over "machine" at the same position.
    alternatives = '|'.join(
        re.escape(term) for term in sorted(terms, key=len, reverse=True)
    )
    combined = r'(?<!\w)(' + alternatives + r')(?!\w)'
    return re.sub(
        combined,
        r'<mark style="background-color: yellow;">\1</mark>',
        text,
        flags=re.IGNORECASE,
    )

PreprocessText.py ADDED Viewed

	@@ -0,0 +1,28 @@

+import re
def preprocess_text(text):
    """Clean raw resume text by applying a fixed sequence of regex rewrites.

    The substitutions run strictly in the order listed below; later rules see
    the output of earlier ones.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text.
    """
    rewrite_rules = (
        # Newlines and tabs are dropped outright (not replaced by a space).
        (r'\n|\t', ''),
        # A lone capital letter surrounded by whitespace collapses to a space.
        (r'\s[A-Z]\s', ' '),
        # Anything that looks like an e-mail address.
        (r'\S+@\S+', ''),
        # Dates written as DD-MM-YYYY or DD/MM/YYYY.
        (r'\d{2}[-/]\d{2}[-/]\d{4}', ''),
        # Phone numbers starting with "+" and a two-digit prefix.
        (r'\+\d{2}\s?\d{2,3}\s?\d{3,4}\s?\d{4}', ''),
        # "Issued <word> <year>Credential ID <id>" fragments.
        (r'Issued\s\w+\s\d{4}Credential ID \w+', ''),
        # Runs of whitespace become a single space.
        (r'\s+', ' '),
        # Insert a space where a lowercase letter is directly followed by an
        # uppercase one (e.g. words glued together by the newline removal).
        (r'(?<=[a-z])(?=[A-Z])', ' '),
    )
    cleaned = text
    for pattern, replacement in rewrite_rules:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

app.py ADDED Viewed

	@@ -0,0 +1,243 @@

+import re
+import streamlit as st
+from PyPDF2 import PdfReader
+from dotenv import load_dotenv
+from FindKeyword import FindKeyWords
+from PreprocessText import preprocess_text
+from model_Responce import model_prediction
+from streamlit_extras.add_vertical_space import add_vertical_space
+from langchain.text_splitter import CharacterTextSplitter
+from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
+from langchain.vectorstores import FAISS
+# from langchain.chat_models import ChatOpenAI
+# from langchain.memory import ConversationBufferMemory
+# from langchain.chains import ConversationalRetrievalChain
+from htmlTemplates import css, bot_template, user_template
+from InstructorEmbedding import INSTRUCTOR
+import numpy as np
+from sklearn.metrics.pairwise import cosine_similarity
def get_text_chunks(text):
    """Split ``text`` into chunks using a ``CharacterTextSplitter``.

    The splitter is configured with a newline separator, a chunk size of 1000,
    an overlap of 200, and ``len`` as the length function.

    Args:
        text: The text to split.

    Returns:
        Whatever ``CharacterTextSplitter.split_text`` returns for ``text``.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(text)
def encode_question(question, embeddings=None):
    """Encode ``question`` into a vector with an embeddings model.

    Args:
        question: The question text to embed.
        embeddings: Optional object exposing ``embed_query(text)``. Pass an
            already-constructed embeddings instance to avoid building a new
            one on every call. When omitted, a fresh
            ``HuggingFaceInstructEmbeddings()`` is created (original behaviour).

    Returns:
        The result of ``embeddings.embed_query(question)``.
    """
    if embeddings is None:
        # Fallback keeps the single-argument call working for existing callers.
        embeddings = HuggingFaceInstructEmbeddings()
    return embeddings.embed_query(question)
+# def handle_user_input(question):
+#     response = st.session_state.conversation({'question':question})
+#     st.session_state.chat_history = response('chat_history')
+#     for i,message in enumerate(st.session_state.chat_history):
+#         if i % 2 == 0:
+#             st.write(user_template.replace("{{MSG}}",message.content),unsafe_allow_html=True)
+#         else:
+#             st.write(bot_template.replace("{{MSG}}",message.content),unsafe_allow_html=True)
+# def get_conversation_chain(vector_store):
+#     llm = ChatOpenAI()
+#     memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
+#     conversation_chain = ConversationalRetrievalChain.from_llm(
+#         llm=llm,
+#         retriever=vector_store.as_retriever(),
+#         memory = memory
+#     )
+#     return conversation_chain
def save_vector_store(text_chunks):
    """Embed ``text_chunks`` and persist them in the local FAISS index.

    If the ``faiss_index_V2`` directory already exists, the new vectors are
    merged into the stored index; otherwise a new index is written there.

    Args:
        text_chunks: List of text chunks to embed.

    Returns:
        The return value of the ``st.write`` call that reports the outcome.
    """
    import os  # local import: only needed for the index-existence check

    index_dir = "faiss_index_V2"

    if not text_chunks:
        # Nothing to embed; skip building an index from an empty list.
        return st.write("No text found to add to the vector store")

    embeddings = HuggingFaceInstructEmbeddings()
    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)

    # Only merge when a previous index is actually on disk; on a first run
    # there is nothing to load yet.
    if os.path.isdir(index_dir):
        stored_db = FAISS.load_local(index_dir, embeddings)
        stored_db.merge_from(vectorstore)
        vectorstore = stored_db

    vectorstore.save_local(index_dir)
    return st.write("vector Store is Saved")
def button_function(all_text):
    """Attach a model prediction to every entry of ``all_text``.

    Each entry is expected to be a dict with a ``'text'`` key; the result of
    ``model_prediction`` for that text is stored under ``'prediction'`` on the
    same dict (the entries are modified in place).

    Args:
        all_text: Iterable of dicts, each holding a ``'text'`` value.

    Returns:
        The same ``all_text`` object that was passed in.
    """
    for entry in all_text:
        entry['prediction'] = model_prediction(entry['text'])
    return all_text
def _extract_pdf_text(pdf):
    """Return the concatenated text of every page of one PDF file object."""
    # ``or ""`` is defensive: a page yielding a falsy result contributes nothing.
    return "".join(page.extract_text() or "" for page in PdfReader(pdf).pages)


def get_pdf_text(pdfs,preprocess=True):
    """Extract the text of the uploaded PDF files.

    Args:
        pdfs: Iterable of PDF file objects readable by ``PdfReader``. When
            ``preprocess`` is true each object must also expose ``.name``.
            ``None`` is treated like an empty collection.
        preprocess: When true, return one cleaned entry per file; when false,
            return the raw text of all files concatenated.

    Returns:
        If ``preprocess`` is true: a list of
        ``{"filename": <pdf.name>, "text": <preprocess_text(raw text)>}`` dicts,
        one per file and in input order.
        Otherwise: a single string with the unprocessed text of all files
        joined together without a separator.
    """
    pdfs = pdfs or []
    if preprocess:
        return [
            {"filename": pdf.name, "text": preprocess_text(_extract_pdf_text(pdf))}
            for pdf in pdfs
        ]
    return "".join(_extract_pdf_text(pdf) for pdf in pdfs)
def filter_keywords(all_text, keywords):
    """Run ``FindKeyWords`` over the text of every entry.

    Args:
        all_text: Iterable of dicts with ``'filename'`` and ``'text'`` keys.
        keywords: Keywords forwarded unchanged to ``FindKeyWords``.

    Returns:
        A new list of ``{"filename": ..., "text": ...}`` dicts where ``text``
        is the ``FindKeyWords`` result for the corresponding input entry.
    """
    return [
        {"filename": entry['filename'], "text": FindKeyWords(keywords, entry['text'])}
        for entry in all_text
    ]
+# Main body
def main():
    """Render the Streamlit page and run the functionality picked in the sidebar.

    The sidebar offers a PDF uploader and a radio selector with four modes:

    * "Make Predictions" - show the model prediction for every uploaded file.
    * "Filter Keywords" - highlight comma-separated keywords in every file.
    * "Predict the Suitable canditate" - list files whose prediction equals
      the entered keyword (case-insensitive).
    * "Ask Questions" - add the uploaded text to the local FAISS index via the
      sidebar "Process" button and show the best-matching stored chunk for a
      question.

    PDFs are parsed only when a button is pressed, so Streamlit reruns caused
    by other widget changes do not re-read the uploads.
    """
    import os  # local import: only needed for the index-existence check

    load_dotenv()
    st.header("Resume Filter using Keywords 💬")

    # Sidebar contents
    with st.sidebar:
        st.title('🤗💬 LLM Chat App')
        # upload a PDF file
        pdfs = st.file_uploader("Upload your Resumes", type='pdf',accept_multiple_files=True)
        # Choose functionality: Prediction or Filtering
        functionality = st.radio("Choose functionality:", ("Make Predictions", "Filter Keywords","Predict the Suitable canditate","Ask Questions"))
        if functionality == "Ask Questions":
            if st.button('Process'):
                with st.spinner("Processing"):
                    if not pdfs:
                        # Without uploads there is no text to index.
                        st.warning("Please upload at least one resume first.")
                    else:
                        raw_text = get_pdf_text(pdfs, preprocess=False)
                        text_chunks = get_text_chunks(raw_text)
                        save_vector_store(text_chunks)
        add_vertical_space(5)
        st.write('Made with ❤️ by Fazni Farook')

    if pdfs is None:
        return

    if functionality == "Make Predictions":
        if st.button('Make Prediction'):
            with st.spinner("Progressing"):
                for item in button_function(get_pdf_text(pdfs)):
                    st.markdown(f"**Filename: {item['filename']}**")
                    st.markdown(f"**Prediction: {item['prediction']}**")
                    st.markdown("---")

    elif functionality == "Filter Keywords":
        keyword_input = st.text_input("Keyword")
        # Drop blank entries so an empty input box does not become [""].
        keywords = [kw.strip() for kw in keyword_input.split(",") if kw.strip()]
        if st.button('Filter Keywords'):
            with st.spinner("Progressing"):
                if not keywords:
                    st.markdown("Please enter at least one keyword.")
                else:
                    for item in filter_keywords(get_pdf_text(pdfs), keywords):
                        st.markdown(f"**Filename: {item['filename']}**")
                        st.markdown(item["text"], unsafe_allow_html=True)
                        st.markdown("---")

    elif functionality == "Predict the Suitable canditate":
        keyword = st.text_input("Keyword")
        if st.button('Filter Resumes'):
            with st.spinner("Progressing"):
                # Ignore stray surrounding whitespace in the typed label.
                wanted = keyword.strip().lower()
                count = 0
                for item in button_function(get_pdf_text(pdfs)):
                    prediction = item["prediction"]
                    if wanted == prediction.lower():
                        count += 1
                        st.markdown(f"**Filename: {item['filename']}**")
                        st.markdown(prediction, unsafe_allow_html=True)
                        st.markdown("---")
                if count == 0:
                    st.markdown("No match found")

    elif functionality == "Ask Questions":
        if not os.path.isdir("faiss_index_V2"):
            # Loading a missing index would raise; ask the user to build it.
            st.warning('No vector store found. Upload resumes and click "Process" first.')
            return
        embeddings = HuggingFaceInstructEmbeddings()
        new_db = FAISS.load_local("faiss_index_V2", embeddings)
        st.write(css,unsafe_allow_html=True)
        question = st.text_input("Ask Question")
        if st.button('Ask Question'):
            with st.spinner("Processing"):
                if question:
                    # Reuse the embeddings object built above instead of
                    # constructing a second model just to embed the question.
                    question_vector = embeddings.embed_query(question)
                    output = new_db.similarity_search_by_vector(question_vector)
                    if output:
                        st.write(output[0].page_content)
                    else:
                        st.write("No relevant content found.")
+if __name__=='__main__':
+    main()

htmlTemplates.py ADDED Viewed

	@@ -0,0 +1,44 @@

# Style sheet for the chat bubbles; injected into the page as raw HTML.
css = '''
<style>
.chat-message {
    padding: 1.5rem; border-radius: 0.5rem; margin-bottom: 1rem; display: flex
}
.chat-message.user {
    background-color: #2b313e
}
.chat-message.bot {
    background-color: #475063
}
.chat-message .avatar {
  width: 20%;
}
.chat-message .avatar img {
  max-width: 78px;
  max-height: 78px;
  border-radius: 50%;
  object-fit: cover;
}
.chat-message .message {
  width: 80%;
  padding: 0 1.5rem;
  color: #fff;
}
</style>
'''
# HTML for a bot message; "{{MSG}}" is the placeholder for the message text.
bot_template = '''
<div class="chat-message bot">
    <div class="avatar">
        <img src="https://i.ibb.co/cN0nmSj/Screenshot-2023-05-28-at-02-37-21.png" style="max-height: 78px; max-width: 78px; border-radius: 50%; object-fit: cover;">
    </div>
    <div class="message">{{MSG}}</div>
</div>
'''
# HTML for a user message; "{{MSG}}" is the placeholder for the message text.
user_template = '''
<div class="chat-message user">
    <div class="avatar">
        <img src="https://i.ibb.co/rdZC7LZ/Photo-logo-1.png">
    </div>
    <div class="message">{{MSG}}</div>
</div>
'''

model_Responce.py ADDED Viewed

	@@ -0,0 +1,38 @@

+import pickle
+import joblib
+import numpy as np
+import tensorflow as tf
+from keras.utils import pad_sequences
+from keras.preprocessing.text import Tokenizer
# Load the trained Keras model. The path is relative (the repository ships the
# weights as models/model.h5) rather than an absolute drive path that only
# exists on one machine; forward slashes also avoid invalid "\C" / "\m"
# escape sequences in the string literal.
model = tf.keras.models.load_model('models/model.h5')

# NOTE(review): tokenized_words.pkl is not part of this commit - confirm it is
# deployed at this relative location.
tokenfile = 'tokenized_words/tokenized_words.pkl'

# Load the tokenized words from the pickle file.
# SECURITY: pickle.load can execute arbitrary code; only load trusted files.
with open(tokenfile, 'rb') as file:
    loaded_tokenized_words = pickle.load(file)

# Sequences are padded to this length before prediction.
max_review_length = 200

# Fit the tokenizer on the loaded tokenized words.
tokenizer = Tokenizer(num_words=10000,  # max no. of unique words to keep
                      # "\\" keeps the literal backslash of the original filter
                      # set without relying on an invalid "\]" escape.
                      filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~',
                      lower=True  # convert to lower case
                     )
tokenizer.fit_on_texts(loaded_tokenized_words)

# Class names; model_prediction indexes this list with the argmax of the
# model output.
outcome_labels = ['Business Analyst', 'Cyber Security','Data Engineer','Data Science','DevOps','Machine Learning Engineer','Mobile App Developer','Network Engineer','Quality Assurance','Software Engineer']
def model_prediction(text, model=model, tokenizer=tokenizer, labels=outcome_labels):
    """Return the label the model assigns to ``text``.

    The text is converted to a sequence with ``tokenizer``, padded to
    ``max_review_length``, passed to ``model.predict``, and the label at the
    argmax of the prediction is returned.

    Args:
        text: The text to classify.
        model: Object exposing ``predict``; defaults to the module-level model.
        tokenizer: Object exposing ``texts_to_sequences``; defaults to the
            module-level tokenizer.
        labels: Sequence of labels indexed by the argmax of the prediction.

    Returns:
        The entry of ``labels`` selected by ``np.argmax`` of the prediction.
    """
    sequences = tokenizer.texts_to_sequences([text])
    model_input = pad_sequences(sequences, maxlen=max_review_length)
    probabilities = model.predict(model_input)
    best_index = np.argmax(probabilities)
    return labels[best_index]

models/model.h5 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cc809fc62b4f84621e22ecf8fe9c2af763d9f4fd0f1383c92e1e0a9aaae59674
+size 51959288

requirements.txt ADDED Viewed

	@@ -0,0 +1,17 @@

+langchain==0.0.195
+PyPDF2==3.0.1
+python-dotenv==1.0.0
+streamlit==1.18.1
+faiss-cpu==1.7.4
+streamlit-extras
+altair<5
+pdfminer.six==20221105
+numpy
+keras==2.12.0
+tensorflow==2.12.0
+joblib
+openai
+huggingface_hub
+InstructorEmbedding
+torch
+sentence_transformers