KonstantinosKakkavas committed on
Commit
adbfd9e
1 Parent(s): 0486202
My PDF.pdf ADDED
Binary file (16.6 kB). View file
 
main.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This is a sample Python script.
2
+
3
+ # Press Shift+F10 to execute it or replace it with your code.
4
+ # Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.
5
+
6
+
7
def print_hi(name):
    """Print a friendly greeting addressed to *name*."""
    greeting = f'Hi, {name}'
    print(greeting)  # Press Ctrl+F8 to toggle the breakpoint.
10
+
11
+
12
# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    # Script entry point: greet the default IDE name.
    print_hi('PyCharm')

# See PyCharm help at https://www.jetbrains.com/help/pycharm/
resume_screening_assistance/app.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Application that helps HR screen resumes, using LLMs to find the best fits for the job
2
+ import streamlit as st
3
+ import uuid
4
+
5
+ from utils import *
6
+
7
# Session-state bootstrap: unique_id tags the documents of the current
# user's upload batch so they can be retrieved selectively later.
_UNIQUE_ID_KEY = "unique_id"
if _UNIQUE_ID_KEY not in st.session_state:
    st.session_state[_UNIQUE_ID_KEY] = ''
10
+
11
+
12
def main():
    """Render the resume-screening Streamlit UI and run the analysis on submit.

    Flow: collect a job description and resume PDFs, convert each PDF into a
    Document tagged with a per-submission unique id, embed them, and hand them
    to the Pinecone push helper, writing intermediate results to the page.
    """
    import os  # local import: only needed to read the Pinecone key from the environment

    st.set_page_config(page_title="Resume Screening Assistance")
    st.title('HR Resume Screening Assistance')
    st.subheader('I can help you in resume screening process')

    # NOTE(review): job_description and document_count are collected but never
    # used by the analysis below — wire them into the query or drop the fields.
    job_description = st.text_area("Enter your job description", key="1")
    document_count = st.text_area("No.of 'Resumes' to return", key="2")

    # Upload the resumes (PDF files only).
    pdf = st.file_uploader("Upload resumes here, only PDF files allowed", type=["pdf"], accept_multiple_files=True)

    submit = st.button("Help me with the analysis")

    if submit:
        with st.spinner('Wait for it...'):
            # A fresh unique id per submission lets us later query only this
            # user's documents from the Pinecone vector store.
            st.session_state["unique_id"] = uuid.uuid4().hex

            # Build a Document list out of all the user-uploaded PDF files.
            docs = create_docs(pdf, st.session_state["unique_id"])

            # Show how many resumes were processed.
            st.write(len(docs))

            # Load the embedding model.
            embeddings = create_embeddings_load_data()

            # SECURITY FIX: a Pinecone API key used to be hard-coded right
            # here and was committed to the repository — that key is
            # compromised and must be rotated. Read the key from the
            # environment instead of embedding it in source.
            pinecone_api_key = os.environ.get("PINECONE_API_KEY", "")

            # Push data to Pinecone and show the result.
            st.write(push_to_pinecone(pinecone_api_key, "test", embeddings, docs))

            st.success('Hope I was able to save your time <3')
48
+
49
+
50
# Script entry point: launch the Streamlit app.
if __name__ == '__main__':
    main()
resume_screening_assistance/requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ langchain
2
+ streamlit
3
+ openai
4
+ tiktoken
5
+ python-dotenv
6
+ unstructured
7
+ pinecone-client
8
+ pypdf
9
+ sentence_transformers
resume_screening_assistance/utils.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import openai
2
+ from langchain.embeddings.openai import OpenAIEmbeddings
3
+ from pinecone import Pinecone
4
+ from langchain_openai import OpenAI
5
+ from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
6
+ from sentence_transformers import SentenceTransformer
7
+ from langchain.chains.summarize import load_summarize_chain
8
+ from langchain import HuggingFaceHub
9
+ from PyPDF2 import PdfReader
10
+ from langchain.schema import Document
11
+
12
+
13
def get_pdf_text(pdf_doc):
    """Extract and concatenate the text of every page of a PDF.

    Args:
        pdf_doc: a path or binary file-like object readable by PyPDF2's
            ``PdfReader`` (e.g. a Streamlit ``UploadedFile``).

    Returns:
        The concatenated text of all pages; "" when no text is extractable.
    """
    pdf_reader = PdfReader(pdf_doc)
    # extract_text() can return None for pages without extractable text
    # (e.g. scanned images); coalesce to "" so the join never fails.
    # str.join also avoids the quadratic cost of repeated `text +=`.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
20
+
21
+
22
def create_docs(user_pdf_list, unique_id):
    """Turn each user-uploaded PDF into a Document tagged with *unique_id*."""
    return [
        Document(
            page_content=get_pdf_text(uploaded_file),
            metadata={
                "name": uploaded_file.name,
                # "id": uploaded_file.id,  # todo error here because not all files have id as it seems...
                "type": uploaded_file.type,
                "size": uploaded_file.size,
                "unique_id": unique_id,
            },
        )
        for uploaded_file in user_pdf_list
    ]
33
+
34
+
35
def create_embeddings_load_data():
    """Load and return the sentence-transformer model used for embeddings.

    Returns:
        A SentenceTransformer ("all-MiniLM-L6-v2"); text is encoded by
        calling its ``encode()`` method.
    """
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    return embedding_model
39
+
40
+
41
def push_to_pinecone(pinecone_apikey, pinecone_index_name, embeddings: SentenceTransformer, docs: list[Document]):
    """Encode each document's text into an embedding vector.

    NOTE(review): despite its name, this function does NOT yet upsert
    anything into Pinecone — the original upsert call was commented out.
    The client and index handle are still created (validating the key and
    index name), but only the encoded docs are returned.

    Args:
        pinecone_apikey: Pinecone API key (supply via env/secret; never hard-code).
        pinecone_index_name: name of the target Pinecone index.
        embeddings: SentenceTransformer model used to encode page text.
        docs: documents to encode; each ``page_content`` is REPLACED in
            place by its embedding vector (the original text is lost).

    Returns:
        The same ``docs`` list, mutated so every ``page_content`` holds the
        embedding of the former text.
    """
    pc = Pinecone(api_key=pinecone_apikey)
    # TODO: call index.upsert(...) with (id, vector, metadata) tuples so the
    # vectors actually reach Pinecone; until then this handle is unused.
    index = pc.Index(pinecone_index_name)

    for doc in docs:
        # Destructive: overwrites the original text with its embedding.
        doc.page_content = embeddings.encode(doc.page_content)
    return docs

# TODO: implement pull_from_pinecone(pinecone_apikey, pinecone_index_name, docs)
# to retrieve this user's documents back from the index.