Spaces:
No application file
No application file
KonstantinosKakkavas
committed on
first
Browse files- My PDF.pdf +0 -0
- main.py +16 -0
- resume_screening_assistance/app.py +51 -0
- resume_screening_assistance/requirements.txt +9 -0
- resume_screening_assistance/utils.py +63 -0
My PDF.pdf
ADDED
Binary file (16.6 kB). View file
|
|
main.py
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# This is a sample Python script.
|
2 |
+
|
3 |
+
# Press Shift+F10 to execute it or replace it with your code.
|
4 |
+
# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.
|
5 |
+
|
6 |
+
|
7 |
+
def print_hi(name):
    """Print a greeting of the form ``Hi, <name>`` to standard output.

    Args:
        name: Value interpolated into the greeting.
    """
    # Use a breakpoint in the code line below to debug your script.
    print(f'Hi, {name}')  # Press Ctrl+F8 to toggle the breakpoint.
|
10 |
+
|
11 |
+
|
12 |
+
# Press the green button in the gutter to run the script.
|
13 |
+
# Only greet when this file is executed as a script, not when imported.
if __name__ == '__main__':
    print_hi('PyCharm')
|
15 |
+
|
16 |
+
# See PyCharm help at https://www.jetbrains.com/help/pycharm/
|
resume_screening_assistance/app.py
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Application that helps the resume screener use LLMs to find the best fits for the job
|
2 |
+
import streamlit as st
|
3 |
+
import uuid
|
4 |
+
|
5 |
+
from utils import *
|
6 |
+
|
7 |
+
# Create the session variables used by this app.
# "unique_id" starts empty and is overwritten with a fresh id on each submit.
if "unique_id" not in st.session_state:
    st.session_state["unique_id"] = ''
|
10 |
+
|
11 |
+
|
12 |
+
def main():
    """Render the resume-screening page and process the uploads on submit.

    Builds the Streamlit page (job description, result count, PDF uploader
    and a submit button). When the button is pressed, the uploaded PDFs are
    turned into documents tagged with a fresh unique id, an embedding model
    is created, and both are handed to ``push_to_pinecone``; its return
    value is written to the page.

    The Pinecone API key is read from the ``PINECONE_API_KEY`` environment
    variable rather than being hard-coded in source. The index name comes
    from ``PINECONE_INDEX_NAME`` and defaults to ``"test"``.
    """
    # Local import so this function does not depend on the file's import block.
    import os

    st.set_page_config(page_title="Resume Screening Assistance")
    st.title('HR Resume Screening Assistance')
    st.subheader('I can help you in resume screening process')

    # NOTE(review): both inputs are collected but not used anywhere below yet.
    job_description = st.text_area("Enter your job description", key="1")
    document_count = st.text_area("No.of 'Resumes' to return", key="2")

    # Upload the resumes (PDF files only).
    pdf = st.file_uploader("Upload resumes here, only PDF files allowed", type=["pdf"], accept_multiple_files=True)

    submit = st.button("Help me with the analysis")
    if not submit:
        return

    # Credentials must never live in source control; take them from the
    # environment and stop early with a visible error when they are missing.
    pinecone_api_key = os.environ.get("PINECONE_API_KEY")
    if not pinecone_api_key:
        st.error("PINECONE_API_KEY environment variable is not set.")
        return
    pinecone_index_name = os.environ.get("PINECONE_INDEX_NAME", "test")

    with st.spinner('Wait for it...'):
        # A unique id per submission, so that a later query can be limited
        # to the documents this user uploaded to the Pinecone vector store.
        st.session_state["unique_id"] = uuid.uuid4().hex

        # Create a documents list out of all the user uploaded PDF files.
        docs = create_docs(pdf, st.session_state["unique_id"])

        # Display the count of the docs that were uploaded.
        st.write(len(docs))

        # Create the embeddings instance.
        embeddings = create_embeddings_load_data()

        # Hand the documents to the Pinecone helper and show what it returns.
        st.write(push_to_pinecone(
            pinecone_api_key,
            pinecone_index_name, embeddings, docs)
        )

    st.success('Hope I was able to save your time <3')
|
48 |
+
|
49 |
+
|
50 |
+
# Entry point when the file is executed as a script.
if __name__ == '__main__':
    main()
|
resume_screening_assistance/requirements.txt
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
langchain
|
2 |
+
streamlit
|
3 |
+
openai
|
4 |
+
tiktoken
|
5 |
+
python-dotenv
|
6 |
+
unstructured
|
7 |
+
pinecone-client
|
8 |
+
pypdf
|
9 |
+
sentence_transformers
|
resume_screening_assistance/utils.py
ADDED
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import openai
|
2 |
+
from langchain.embeddings.openai import OpenAIEmbeddings
|
3 |
+
from pinecone import Pinecone
|
4 |
+
from langchain_openai import OpenAI
|
5 |
+
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
|
6 |
+
from sentence_transformers import SentenceTransformer
|
7 |
+
from langchain.chains.summarize import load_summarize_chain
|
8 |
+
from langchain import HuggingFaceHub
|
9 |
+
from PyPDF2 import PdfReader
|
10 |
+
from langchain.schema import Document
|
11 |
+
|
12 |
+
|
13 |
+
def get_pdf_text(pdf_doc):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        pdf_doc: The PDF source; it is passed unchanged to ``PdfReader``.

    Returns:
        The concatenation of ``extract_text()`` over all pages, in page
        order, with no separator inserted between pages.
    """
    # NOTE(review): PdfReader is imported from PyPDF2 in this file, while the
    # accompanying requirements.txt lists pypdf - confirm which one is meant.
    pdf_reader = PdfReader(pdf_doc)
    # Defensive: a falsy extraction result is treated as empty text so the
    # concatenation cannot fail on a page that yields nothing.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
|
20 |
+
|
21 |
+
|
22 |
+
def create_docs(user_pdf_list, unique_id):
    """Build one ``Document`` per uploaded PDF file.

    Args:
        user_pdf_list: Iterable of uploaded file objects. Each one is passed
            to ``get_pdf_text`` and must expose ``name``, ``type`` and
            ``size`` attributes. ``None`` is treated as an empty list.
        unique_id: Identifier stored in every document's metadata under the
            ``"unique_id"`` key.

    Returns:
        A list of ``Document`` objects whose ``page_content`` is the text
        extracted from the file and whose metadata holds the file's name,
        type, size and the given ``unique_id``.
    """
    # TODO: an "id" metadata entry (uploaded_file.id) was left out because
    # not every uploaded file seems to have an id.
    return [
        Document(
            page_content=get_pdf_text(uploaded_file),
            metadata={
                "name": uploaded_file.name,
                "type": uploaded_file.type,
                "size": uploaded_file.size,
                "unique_id": unique_id,
            },
        )
        for uploaded_file in (user_pdf_list or [])
    ]
|
33 |
+
|
34 |
+
|
35 |
+
def create_embeddings_load_data(model_name="all-MiniLM-L6-v2"):
    """Create and return the sentence-embedding model.

    Args:
        model_name: Name handed to ``SentenceTransformer``. Defaults to
            ``"all-MiniLM-L6-v2"``, the model this function always used.

    Returns:
        The ``SentenceTransformer`` instance; sentences are encoded by
        calling its ``encode()`` method.
    """
    return SentenceTransformer(model_name)
|
39 |
+
|
40 |
+
|
41 |
+
def push_to_pinecone(pinecone_apikey, pinecone_index_name, embeddings: SentenceTransformer, docs: list[Document]):
    """Embed the documents in preparation for pushing them to Pinecone.

    Creates a Pinecone client and an index handle, then replaces each
    document's ``page_content`` with ``embeddings.encode(page_content)``.

    NOTE(review): nothing is upserted to the index yet - the index handle is
    created but never used, and the documents are modified in place (their
    original text is overwritten by the encoded value).

    Args:
        pinecone_apikey: API key passed to ``Pinecone``.
        pinecone_index_name: Name of the index passed to ``pc.Index``.
        embeddings: Model whose ``encode`` method produces the vectors.
        docs: Documents to embed; mutated in place.

    Returns:
        The same ``docs`` list, with every ``page_content`` replaced by its
        encoded form.
    """
    pc = Pinecone(api_key=pinecone_apikey)
    # Kept from the original even though it is unused: the handle is what a
    # future upsert would go through.
    index = pc.Index(pinecone_index_name)

    # TODO: upsert the vectors through `index`. Every document of one
    # submission shares the same metadata["unique_id"], so that value alone
    # cannot serve as a per-vector id.
    for doc in docs:
        doc.page_content = embeddings.encode(doc.page_content)
    return docs
|
59 |
+
|
60 |
+
# index.upsert(embeddings.encode([doc.page_content for doc in docs]))
|
61 |
+
|
62 |
+
# def pull_from_pinecone(pinecone_apikey, pinecone_index_name, docs: list[Document]):
|
63 |
+
# if
|