Spaces:
Runtime error
Runtime error
File size: 5,143 Bytes
82c7530 22b0051 ac438eb cdf20ae de0702c 2e95f21 eac2263 22b0051 75539bd eac2263 9b70b81 22b0051 797f3b1 9b70b81 22b0051 9b70b81 22b0051 18baae6 22b0051 9b70b81 5ae3ab6 e581254 5ae3ab6 9b70b81 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 |
import streamlit as st
import os
import nltk
import json
import base64
import pandas as pd
import ast
# Expose the deployment secret under the variable name the OpenAI client reads.
# os.environ only accepts strings, so the copy is skipped when SECRET_KEY is
# unset instead of raising TypeError while the module is still importing.
if os.getenv("SECRET_KEY") is not None:
    os.environ["OPENAI_API_KEY"] = os.environ["SECRET_KEY"]
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
from PIL import Image
from datetime import datetime
from tempfile import NamedTemporaryFile
import pypdfium2 as pdfium
# Preview images shown in the "CV examples" section, one per example button.
examples = [
    "CV.png",
    "cv_patrik.jpg",
    "cv_melanie.jpg",
]
# PDF documents parsed when the example button at the same index is clicked.
examples_pdf = [
    "CV.pdf",
    "CV_Patrik.pdf",
    "CV_Melanie.pdf",
]
def load_image(image_file):
    """Open ``image_file`` with PIL and return the resulting image object."""
    return Image.open(image_file)
def _parse_model_output(output):
    """Turn the model's text reply into a Python object.

    The reply is expected to contain a JSON object, possibly surrounded by
    prose or Markdown code fences. Strict JSON is tried first so that
    ``null``/``true``/``false`` are accepted; ``ast.literal_eval`` is kept as
    a fallback for replies written as a Python literal (e.g. single quotes).

    Raises:
        ValueError: if the reply cannot be parsed either way.
    """
    text = output.strip()
    # Keep only the outermost {...} span so fences or extra prose are ignored.
    start, end = text.find("{"), text.rfind("}")
    if start != -1 and end > start:
        text = text[start:end + 1]
    try:
        return json.loads(text)
    except ValueError:
        pass
    try:
        return ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as err:
        raise ValueError("model reply is not a JSON object") from err


def _normalize_keys(record):
    """Return a copy of ``record`` with lower-case, underscore-separated keys."""
    return {
        str(key).strip().lower().replace(" ", "_"): value
        for key, value in record.items()
    }


def main():
    """Render the CV-parsing page and process the selected document.

    The page shows a header, a file uploader and three example CVs. When an
    example button is clicked, or a file is uploaded, the document is rendered
    page by page, sent through the LangChain question-answering chain, and the
    reply is shown as JSON, as a one-row table, and as a downloadable CSV.
    """
    head1, head2 = st.columns(2)
    with head1:
        st.image(load_image('tulaco.png'), width=200)
    with head2:
        st.write('mail@tula.co')
        st.write('www.tula.co')
    st.title("CV parsing with Chat GPT")

    # The flag only means "an example button was clicked during THIS run".
    # Resetting it on every run keeps an earlier example click from blocking
    # all later uploads for the rest of the session.
    st.session_state.initialized = True
    st.session_state.isbutton = False

    uploaded_file = st.file_uploader("Upload CV in PDF or image format", type=["pdf","png","jpg","jpeg"])
    nltk.download('punkt')
    nltk.download('averaged_perceptron_tagger')

    pdf_path = ''
    temp_pdf_path = None  # temp file created for an upload; removed at the end

    st.subheader("CV examples")
    col1, col2, col3 = st.columns(3)
    with col1:
        first_example = load_image(examples[0])
        st.image(first_example, width=100)
        if st.button('Example 1'):
            # The first example's PDF is regenerated from its preview image.
            first_example.convert('RGB').save(examples_pdf[0])
            st.session_state.isbutton = True
            pdf_path = examples_pdf[0]
    with col2:
        st.image(load_image(examples[1]), width=100)
        if st.button('Example 2'):
            st.session_state.isbutton = True
            pdf_path = examples_pdf[1]
    with col3:
        st.image(load_image(examples[2]), width=100)
        if st.button('Example 3'):
            st.session_state.isbutton = True
            pdf_path = examples_pdf[2]

    try:
        # An example clicked in this run takes precedence over a pending upload.
        if uploaded_file is not None and not st.session_state.isbutton:
            extension = os.path.splitext(uploaded_file.name)[1].lower()
            # Always write to a temp file: naming the PDF after the upload
            # could overwrite one of the bundled example PDFs.
            with NamedTemporaryFile(delete=False, dir='.', suffix='.pdf') as tmp:
                temp_pdf_path = tmp.name
                if extension == '.pdf':
                    tmp.write(uploaded_file.getbuffer())
                else:
                    Image.open(uploaded_file).convert('RGB').save(tmp, format='PDF')
            pdf_path = temp_pdf_path

        if pdf_path != '':
            pdf = pdfium.PdfDocument(pdf_path)
            for page_number in range(len(pdf)):
                page = pdf.get_page(page_number)
                st.image(page.render(scale=4).to_pil(), width=700)

            with st.spinner('Document parsing in progress ...'):
                loader = UnstructuredPDFLoader(pdf_path)
                pages = loader.load_and_split()
                embeddings = OpenAIEmbeddings()
                docsearch = Chroma.from_documents(pages, embeddings).as_retriever()
                current_date = datetime.now()
                query = "Output informatio, (all in English), from the document in JSON format: full name, contacts, age, languages, education, school, work experience, skills. If some fields cannot be filled from the document, then create this field and fill it with N/A. If the date of birth is not indicated, then please calculate the approximate age of the candidate based on the information provided in the document, for calculations, take into account that graduation from the university is usually at 22 years old. Current date = "+ current_date.date().strftime('%Y-%m-%d')
                docs = docsearch.get_relevant_documents(query)
                chain = load_qa_chain(ChatOpenAI(temperature=0), chain_type="stuff")
                output = chain.run(input_documents=docs, question=query)

            st.subheader("Parsing result in JSON format")
            try:
                parsed = _parse_model_output(output)
            except ValueError:
                parsed = None
            if not isinstance(parsed, dict):
                # Show the raw reply instead of crashing the page.
                st.error("The model reply could not be parsed as a JSON object.")
                st.text(output)
                return
            st.json(parsed)

            # The prompt names fields with spaces ("full name"), so keys are
            # normalised before lookup; otherwise those columns are always N/A.
            record = _normalize_keys(parsed)
            table_columns = [
                ("name", "full_name"),
                ("contacts", "contacts"),
                ("age", "age"),
                ("languages", "languages"),
                ("education", "education"),
                ("school", "school"),
                ("work", "work_experience"),
                ("skill", "skills"),
            ]
            df = pd.DataFrame({
                column: [record.get(key, "N/A")] for column, key in table_columns
            })
            st.subheader("Parsing result as a table")
            st.table(df)
            csv = df.to_csv(index=False).encode('utf-8')
            st.download_button(label="Download result as CSV",data=csv,file_name='result_df.csv',mime='text/csv')
            st.success("Ready!")
    finally:
        # Best-effort cleanup so uploads do not pile up in the working directory.
        if temp_pdf_path is not None:
            try:
                os.remove(temp_pdf_path)
            except OSError:
                pass
# Build the page only when this file is executed as the entry script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|