CV_Parser / app.py
ddovidovich
version 0.2
9b70b81
raw
history blame contribute delete
No virus
5.14 kB
import streamlit as st
import os
import nltk
import json
import base64
import pandas as pd
import ast
# Expose the deployment secret under the variable name the OpenAI client
# libraries read. The copy is skipped when SECRET_KEY is unset: os.getenv
# would return None, and assigning None into os.environ raises TypeError at
# import time. Skipping also leaves an OPENAI_API_KEY that is already present
# in the environment untouched.
_secret_key = os.getenv("SECRET_KEY")
if _secret_key is not None:
    os.environ["OPENAI_API_KEY"] = _secret_key
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
from PIL import Image
from datetime import datetime
from tempfile import NamedTemporaryFile
import pypdfium2 as pdfium
# Preview images for the "CV examples" row, in display order.
examples = [
    "CV.png",
    "cv_patrik.jpg",
    "cv_melanie.jpg",
]
# PDF file parsed for each example; paired with ``examples`` by position.
examples_pdf = [
    "CV.pdf",
    "CV_Patrik.pdf",
    "CV_Melanie.pdf",
]
def load_image(image_file):
    """Open *image_file* with Pillow and return the resulting image object.

    The argument is handed to ``Image.open`` unchanged.
    """
    return Image.open(image_file)
# (table column, JSON field) pairs used to build the one-row results table.
RESULT_COLUMNS = (
    ("name", "full_name"),
    ("contacts", "contacts"),
    ("age", "age"),
    ("languages", "languages"),
    ("education", "education"),
    ("school", "school"),
    ("work", "work_experience"),
    ("skill", "skills"),
)


def lookup_field(data, field, default="N/A"):
    """Return ``data[field]``, tolerating differences in key spelling.

    An exact key match wins. Otherwise keys are compared after lower-casing
    and dropping every non-alphanumeric character, so ``"Full Name"``,
    ``"full name"`` and ``"fullName"`` all satisfy ``"full_name"``. The prompt
    asks the model for "full name" / "work experience" without fixing the key
    spelling, so the reply cannot be relied on to use underscores.

    Args:
        data: Mapping parsed from the model reply.
        field: Canonical field name to look up.
        default: Value returned when no key matches.
    """
    if field in data:
        return data[field]

    def squash(name):
        return "".join(ch for ch in str(name).lower() if ch.isalnum())

    wanted = squash(field)
    for key, value in data.items():
        if squash(key) == wanted:
            return value
    return default


def parse_llm_output(text):
    """Parse the model reply into a dict.

    JSON is tried first so that ``null`` / ``true`` / ``false`` are accepted;
    ``ast.literal_eval`` is kept as a fallback for replies written as a Python
    dict literal (single quotes, ``None``). Anything outside the outermost
    ``{...}`` pair, such as a Markdown code fence, is discarded beforehand.

    Raises:
        ValueError: If the reply cannot be parsed or is not a mapping.
    """
    cleaned = str(text).strip()
    start, end = cleaned.find("{"), cleaned.rfind("}")
    if start != -1 and end > start:
        cleaned = cleaned[start:end + 1]
    try:
        parsed = json.loads(cleaned)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        try:
            parsed = ast.literal_eval(cleaned)
        except (ValueError, TypeError, SyntaxError,
                MemoryError, RecursionError) as err:
            raise ValueError(
                "model output is neither JSON nor a Python literal"
            ) from err
    if not isinstance(parsed, dict):
        raise ValueError("model output is not a JSON object")
    return parsed


def remove_quietly(path):
    """Delete *path*, ignoring failures (best-effort temp-file cleanup)."""
    try:
        os.remove(path)
    except OSError:
        # Deliberately ignored: a leftover temp file must not break the page.
        pass


def save_upload_as_pdf(uploaded_file):
    """Write an uploaded PDF or image to a temporary ``.pdf`` and return its path.

    A temporary name is used instead of the uploaded file's own name so that
    an upload can never overwrite one of the bundled example PDFs. The caller
    is responsible for deleting the returned file.
    """
    extension = os.path.splitext(uploaded_file.name)[1].lower()
    with NamedTemporaryFile(delete=False, dir='.', suffix='.pdf') as tmp:
        pdf_path = tmp.name
        if extension == '.pdf':
            tmp.write(uploaded_file.getbuffer())
    if extension != '.pdf':
        try:
            # Images are converted to a single-page PDF via Pillow.
            Image.open(uploaded_file).convert('RGB').save(pdf_path, format='PDF')
        except Exception:
            remove_quietly(pdf_path)
            raise
    return pdf_path


def ask_model_for_cv_json(pdf_path):
    """Load *pdf_path*, index it in Chroma and return the model's raw answer."""
    pages = UnstructuredPDFLoader(pdf_path).load_and_split()
    retriever = Chroma.from_documents(pages, OpenAIEmbeddings()).as_retriever()
    query = (
        "Output informatio, (all in English), from the document in JSON format: "
        "full name, contacts, age, languages, education, school, work experience, "
        "skills. If some fields cannot be filled from the document, then create "
        "this field and fill it with N/A. If the date of birth is not indicated, "
        "then please calculate the approximate age of the candidate based on the "
        "information provided in the document, for calculations, take into "
        "account that graduation from the university is usually at 22 years old. "
        "Current date = "
    ) + datetime.now().date().strftime('%Y-%m-%d')
    docs = retriever.get_relevant_documents(query)
    chain = load_qa_chain(ChatOpenAI(temperature=0), chain_type="stuff")
    return chain.run(input_documents=docs, question=query)


def main():
    """Render the CV-parsing page.

    Shows the header, a file uploader and the example CVs. When an example
    button is clicked, or a file has been uploaded, the PDF pages are
    rendered, the document is sent through the retrieval + chat chain, and
    the answer is displayed as JSON, as a one-row table and as a CSV download.
    """
    head1, head2 = st.columns(2)
    with head1:
        tula_logo = load_image('tulaco.png')
        st.image(tula_logo, width=200)
    with head2:
        st.write('mail@tula.co')
        st.write('www.tula.co')
    st.title("CV parsing with Chat GPT")

    if "initialized" not in st.session_state:
        st.session_state.isbutton = False
        st.session_state.initialized = True

    def forget_example_choice():
        # A new (or removed) upload supersedes an earlier example click;
        # without this reset the flag stays True for the whole session and
        # every later upload is silently ignored.
        st.session_state.isbutton = False

    uploaded_file = st.file_uploader(
        "Upload CV in PDF or image format",
        type=["pdf", "png", "jpg", "jpeg"],
        on_change=forget_example_choice,
    )

    # Fetch the NLTK resources once per session instead of on every rerun.
    # nltk.download is expected to report success as True; if it does not,
    # the flag stays falsy and the downloads repeat on the next rerun.
    if not st.session_state.get("nltk_ready"):
        st.session_state.nltk_ready = all([
            nltk.download('punkt'),
            nltk.download('averaged_perceptron_tagger'),
        ])

    st.subheader("CV examples")
    pdf_path = ''
    is_temporary = False
    columns = st.columns(len(examples))
    for number, (column, image_name, pdf_name) in enumerate(
            zip(columns, examples, examples_pdf), start=1):
        with column:
            preview = load_image(image_name)
            st.image(preview, width=100)
            if st.button(f'Example {number}'):
                if number == 1:
                    # The first example's PDF is rebuilt from its preview
                    # image on every click.
                    preview.convert('RGB').save(pdf_name)
                st.session_state.isbutton = True
                pdf_path = pdf_name

    try:
        if uploaded_file is not None and not st.session_state.isbutton:
            pdf_path = save_upload_as_pdf(uploaded_file)
            is_temporary = True

        if pdf_path:
            pdf = pdfium.PdfDocument(pdf_path)
            for page_number in range(len(pdf)):
                page = pdf.get_page(page_number)
                st.image(page.render(scale=4).to_pil(), width=700)

            with st.spinner('Document parsing in progress ...'):
                output = ask_model_for_cv_json(pdf_path)

            st.subheader("Parsing result in JSON format")
            try:
                parsed = parse_llm_output(output)
            except ValueError:
                st.error("The model response could not be parsed as JSON.")
                st.text(output)
                return
            st.json(parsed)

            # Each value is wrapped in a list so it lands in one table cell,
            # even when the value is itself a list or dict.
            df = pd.DataFrame({
                column_name: [lookup_field(parsed, field)]
                for column_name, field in RESULT_COLUMNS
            })
            st.subheader("Parsing result as a table")
            st.table(df)
            csv = df.to_csv(index=False).encode('utf-8')
            st.download_button(
                label="Download result as CSV",
                data=csv,
                file_name='result_df.csv',
                mime='text/csv',
            )
            st.success("Ready!")
    finally:
        if is_temporary:
            remove_quietly(pdf_path)
# Build the page only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()