KonstantinosKakkavas committed on
Commit
adbfd9e
·
verified ·
1 Parent(s): 0486202
My PDF.pdf ADDED
Binary file (16.6 kB). View file
 
main.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This is a sample Python script.
2
+
3
+ # Press Shift+F10 to execute it or replace it with your code.
4
+ # Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.
5
+
6
+
7
def print_hi(name):
    """Print a greeting for ``name`` to standard output.

    Args:
        name: Value interpolated into the greeting text.
    """
    print(f'Hi, {name}')
10
+
11
+
12
+ # Press the green button in the gutter to run the script.
13
# Only greet when this file is executed directly, not when it is imported.
if __name__ == '__main__':
    print_hi('PyCharm')
15
+
16
+ # See PyCharm help at https://www.jetbrains.com/help/pycharm/
resume_screening_assistance/app.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Application that helps the resume screener use LLMs to find the best fits for the job
2
+ import streamlit as st
3
+ import uuid
4
+
5
+ from utils import *
6
+
7
# Seed the "unique_id" session entry with an empty string if it is not set yet,
# so later reads of st.session_state["unique_id"] never hit a missing key.
if "unique_id" not in st.session_state:
    st.session_state["unique_id"] = ''
10
+
11
+
12
def main():
    """Render the resume-screening page and process the uploads on submit.

    The page collects a job description, a requested result count and one
    or more PDF resumes. When the button is pressed, the uploads are turned
    into documents tagged with a fresh per-submit id, an embedding model is
    loaded, and both are handed to ``push_to_pinecone``.

    The Pinecone API key is read from the ``PINECONE_API_KEY`` environment
    variable and the index name from ``PINECONE_INDEX_NAME`` (default
    ``"test"``); credentials must not be committed to the source tree.
    """
    import os  # function-scope import keeps this block self-contained

    st.set_page_config(page_title="Resume Screening Assistance")
    st.title('HR Resume Screening Assistance')
    st.subheader('I can help you in resume screening process')

    # NOTE(review): both values are collected but not consumed yet.
    job_description = st.text_area("Enter your job description", key="1")
    document_count = st.text_area("No.of 'Resumes' to return", key="2")

    # Upload the resumes (PDF files only).
    pdf = st.file_uploader("Upload resumes here, only PDF files allowed", type=["pdf"], accept_multiple_files=True)

    submit = st.button("Help me with the analysis")
    if not submit:
        return

    pinecone_api_key = os.environ.get("PINECONE_API_KEY")
    if not pinecone_api_key:
        # Fail loudly instead of falling back to a key baked into the code.
        st.error("PINECONE_API_KEY is not set; cannot push resumes to Pinecone.")
        return
    pinecone_index_name = os.environ.get("PINECONE_INDEX_NAME", "test")

    with st.spinner('Wait for it...'):
        # A unique id per submit, so a later query can be limited to the
        # documents uploaded in this run.
        st.session_state["unique_id"] = uuid.uuid4().hex

        # Create a documents list out of all the user uploaded pdf files.
        docs = create_docs(pdf, st.session_state["unique_id"])

        # Display the count of the docs that were uploaded.
        st.write(len(docs))

        # Create embeddings instance.
        embeddings = create_embeddings_load_data()

        # Push data to Pinecone and show what the helper returned.
        st.write(push_to_pinecone(pinecone_api_key, pinecone_index_name, embeddings, docs))

    # NOTE(review): the original nesting was lost in extraction; this is
    # assumed to run once the spinner block has finished - confirm.
    st.success('Hope I was able to save your time <3')
48
+
49
+
50
# Entry point when the file is executed as a script.
if __name__ == '__main__':
    main()
resume_screening_assistance/requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ langchain
2
+ streamlit
3
+ openai
4
+ tiktoken
5
+ python-dotenv
6
+ unstructured
7
+ pinecone-client
8
+ pypdf
9
+ sentence_transformers
resume_screening_assistance/utils.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import openai
2
+ from langchain.embeddings.openai import OpenAIEmbeddings
3
+ from pinecone import Pinecone
4
+ from langchain_openai import OpenAI
5
+ from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
6
+ from sentence_transformers import SentenceTransformer
7
+ from langchain.chains.summarize import load_summarize_chain
8
+ from langchain import HuggingFaceHub
9
+ from PyPDF2 import PdfReader
10
+ from langchain.schema import Document
11
+
12
+
13
def get_pdf_text(pdf_doc):
    """Extract text from pdf file.

    Args:
        pdf_doc: PDF source handed straight to ``PdfReader``.

    Returns:
        str: The extracted text of every page, concatenated in page order.
    """
    pdf_reader = PdfReader(pdf_doc)
    # Join once rather than growing a string page by page; ``or ""`` keeps
    # the join safe should a page yield no text at all.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
20
+
21
+
22
def create_docs(user_pdf_list, unique_id):
    """Build one ``Document`` per uploaded PDF file.

    Args:
        user_pdf_list: Iterable of uploaded file objects; each must expose
            ``name``, ``type`` and ``size`` and be readable by
            ``get_pdf_text``. ``None`` or an empty list yields no documents.
        unique_id: Identifier stored in every document's metadata.

    Returns:
        list: Documents whose ``page_content`` is the extracted PDF text and
        whose metadata holds ``name``, ``type``, ``size`` and ``unique_id``.
    """
    # TODO: "id" is left out of the metadata because not every uploaded file
    # appears to carry an ``id`` attribute.
    return [
        Document(
            page_content=get_pdf_text(uploaded_file),
            metadata={
                "name": uploaded_file.name,
                "type": uploaded_file.type,
                "size": uploaded_file.size,
                "unique_id": unique_id,
            },
        )
        for uploaded_file in (user_pdf_list or [])
    ]
33
+
34
+
35
def create_embeddings_load_data():
    """Load the ``all-MiniLM-L6-v2`` sentence-transformer model.

    Returns:
        SentenceTransformer: The loaded model; text is embedded by calling
        its ``encode()`` method.
    """
    return SentenceTransformer("all-MiniLM-L6-v2")
39
+
40
+
41
def push_to_pinecone(pinecone_apikey, pinecone_index_name, embeddings: SentenceTransformer, docs: list[Document]):
    """Embed ``docs`` and upsert the resulting vectors into a Pinecone index.

    Args:
        pinecone_apikey: API key used to construct the ``Pinecone`` client.
        pinecone_index_name: Name of the index the vectors are written to.
        embeddings: Model whose ``encode()`` turns text into vectors.
        docs: Documents to embed; their ``page_content`` is read, not
            modified.

    Returns:
        list[Document]: The same ``docs`` list that was passed in.
    """
    if not docs:
        # Nothing to embed or upload.
        return docs

    pc = Pinecone(api_key=pinecone_apikey)
    index = pc.Index(pinecone_index_name)

    # Encode all texts in a single batch instead of one call per document.
    vectors = embeddings.encode([doc.page_content for doc in docs])

    # Every document of one upload shares the same unique_id, so the position
    # is appended to keep record ids distinct within the batch.
    records = [
        {
            "id": f"{doc.metadata.get('unique_id', 'doc')}-{position}",
            "values": vector.tolist(),
            "metadata": dict(doc.metadata),
        }
        for position, (doc, vector) in enumerate(zip(docs, vectors))
    ]

    # NOTE(review): assumes the index dimension matches the model's output
    # size - confirm against the index configuration.
    index.upsert(vectors=records)
    return docs
61
+
62
+ # def pull_from_pinecone(pinecone_apikey, pinecone_index_name, docs: list[Document]):
63
+ # if