Spaces:
Runtime error
Runtime error
import streamlit as st | |
import os | |
import nltk | |
import json | |
import base64 | |
import pandas as pd | |
import ast | |
# Expose the deployment secret "SECRET_KEY" under the variable name
# "OPENAI_API_KEY". os.environ only accepts str values, so assigning the
# None that os.getenv returns for a missing variable raises TypeError at
# import time; copy the value only when it is actually defined.
if "SECRET_KEY" in os.environ:
    os.environ["OPENAI_API_KEY"] = os.environ["SECRET_KEY"]
from langchain.document_loaders import PyPDFLoader | |
from langchain.vectorstores import Chroma | |
from langchain.embeddings.openai import OpenAIEmbeddings | |
from langchain.document_loaders import UnstructuredPDFLoader | |
from langchain.chat_models import ChatOpenAI | |
from langchain.chains.question_answering import load_qa_chain | |
from PIL import Image | |
from datetime import datetime | |
from tempfile import NamedTemporaryFile | |
import pypdfium2 as pdfium | |
# Preview images for the "CV examples" row, one per example button.
examples = [
    "CV.png",
    "cv_patrik.jpg",
    "cv_melanie.jpg",
]

# PDF file parsed when the example button at the same index is pressed.
examples_pdf = [
    "CV.pdf",
    "CV_Patrik.pdf",
    "CV_Melanie.pdf",
]
def load_image(image_file):
    """Open ``image_file`` with ``PIL.Image.open`` and return the image.

    ``image_file`` is passed to ``Image.open`` unchanged.
    """
    return Image.open(image_file)
def _remove_quietly(path):
    """Delete ``path``; a file that cannot be removed is left in place."""
    try:
        os.remove(path)
    except OSError:
        # Best-effort cleanup of a scratch file: failing to delete it must
        # not hide the parsing result (or a more useful earlier exception).
        pass


def _save_upload_as_pdf(uploaded_file):
    """Store an uploaded PDF or image as a temporary PDF and return its path.

    The file gets a generated name in the working directory, so an upload
    can never overwrite one of the bundled example documents. The caller
    is responsible for deleting the returned path.
    """
    # Compare case-insensitively so "CV.PDF" is not mistaken for an image.
    is_pdf = os.path.splitext(uploaded_file.name)[1].lower() == '.pdf'
    with NamedTemporaryFile(delete=False, dir='.', suffix='.pdf') as handle:
        pdf_path = handle.name
        if is_pdf:
            handle.write(uploaded_file.getbuffer())
    if not is_pdf:
        try:
            # The ".pdf" suffix makes PIL write the image as a PDF document.
            Image.open(uploaded_file).convert('RGB').save(pdf_path)
        except Exception:
            _remove_quietly(pdf_path)
            raise
    return pdf_path


def _parse_llm_json(output):
    """Turn the model's textual reply into a Python object.

    Strict JSON is tried first (so ``null``/``true``/``false`` work), then a
    Python literal as a fallback for single-quoted replies. A surrounding
    Markdown code fence is removed beforehand.

    Raises:
        ValueError: the reply is neither valid JSON nor a Python literal.
    """
    text = output.strip()
    if text.startswith("```"):
        # Drop the opening fence line ("```" or "```json") and the closing fence.
        text = text.split("\n", 1)[1] if "\n" in text else ""
        text = text.rsplit("```", 1)[0].strip()
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass
    try:
        return ast.literal_eval(text)
    except (ValueError, TypeError, SyntaxError) as err:
        raise ValueError("model reply is neither JSON nor a Python literal") from err


def _normalize_key(key):
    """Reduce a field name to lowercase letters and digits for comparison."""
    return "".join(ch for ch in str(key).lower() if ch.isalnum())


def _lookup_field(data, name):
    """Return ``data[name]``, tolerating case/space/underscore differences.

    The prompt names fields with spaces ("full name") while the table uses
    underscores ("full_name"), so an exact match is tried first and a
    normalized comparison second. Returns ``"N/A"`` when nothing matches.
    """
    if name in data:
        return data[name]
    wanted = _normalize_key(name)
    for key, value in data.items():
        if _normalize_key(key) == wanted:
            return value
    return "N/A"


def _parse_and_render(pdf_path):
    """Show the pages of ``pdf_path``, query the model and render the result.

    Renders every page as an image, runs the question-answering chain over
    the document, then shows the reply as JSON, as a one-row table and as a
    downloadable CSV. A reply that cannot be parsed is reported with
    ``st.error`` instead of raising.
    """
    pdf = pdfium.PdfDocument(pdf_path)
    for page_number in range(len(pdf)):
        page = pdf.get_page(page_number)
        st.image(page.render(scale=4).to_pil(), width=700)

    with st.spinner('Document parsing in progress ...'):
        loader = UnstructuredPDFLoader(pdf_path)
        pages = loader.load_and_split()
        embeddings = OpenAIEmbeddings()
        docsearch = Chroma.from_documents(pages, embeddings).as_retriever()
        current_date = datetime.now()
        # Prompt text is kept verbatim; only the layout of the literal changed.
        query = (
            "Output informatio, (all in English), from the document in JSON format: "
            "full name, contacts, age, languages, education, school, work experience, skills. "
            "If some fields cannot be filled from the document, then create this field and fill it with N/A. "
            "If the date of birth is not indicated, then please calculate the approximate age of the candidate "
            "based on the information provided in the document, for calculations, take into account that "
            "graduation from the university is usually at 22 years old. Current date = "
            + current_date.date().strftime('%Y-%m-%d')
        )
        docs = docsearch.get_relevant_documents(query)
        chain = load_qa_chain(ChatOpenAI(temperature=0), chain_type="stuff")
        output = chain.run(input_documents=docs, question=query)

    st.subheader("Parsing result in JSON format")
    try:
        parsed = _parse_llm_json(output)
    except ValueError:
        st.error("The model reply could not be parsed as JSON.")
        st.code(output)
        return
    st.json(parsed)
    if not isinstance(parsed, dict):
        st.error("The model reply is not a JSON object, so no table can be built.")
        return

    # (table column, field name in the reply)
    columns = (
        ("name", "full_name"),
        ("contacts", "contacts"),
        ("age", "age"),
        ("languages", "languages"),
        ("education", "education"),
        ("school", "school"),
        ("work", "work_experience"),
        ("skill", "skills"),
    )
    # Each value is wrapped in a one-element list so nested lists/dicts stay
    # in a single cell of the single result row.
    df = pd.DataFrame(
        {column: [_lookup_field(parsed, field)] for column, field in columns}
    )
    st.subheader("Parsing result as a table")
    st.table(df)
    csv = df.to_csv(index=False).encode('utf-8')
    st.download_button(
        label="Download result as CSV",
        data=csv,
        file_name='result_df.csv',
        mime='text/csv',
    )
    st.success("Ready!")


def main():
    """Render the CV-parsing page and process the selected document.

    Draws the header, the uploader and three example buttons. An example
    pressed in the current run takes precedence over an uploaded file; the
    chosen document is then displayed and parsed. Temporary files created
    for uploads are removed once processing has finished.
    """
    head1, head2 = st.columns(2)
    with head1:
        st.image(load_image('tulaco.png'), width=200)
    with head2:
        st.write('mail@tula.co')
        st.write('www.tula.co')
    st.title("CV parsing with Chat GPT")

    if "initialized" not in st.session_state:
        st.session_state.initialized = True
    # Reset on every run: the flag only means "an example button was pressed
    # in this run". Leaving it set would ignore uploads for the whole session.
    st.session_state.isbutton = False

    uploaded_file = st.file_uploader(
        "Upload CV in PDF or image format", type=["pdf", "png", "jpg", "jpeg"]
    )
    nltk.download('punkt')
    nltk.download('averaged_perceptron_tagger')

    st.subheader("CV examples")
    example_pdf = None
    slots = zip(st.columns(3), examples, examples_pdf)
    for index, (column, image_name, pdf_name) in enumerate(slots):
        with column:
            preview = load_image(image_name)
            st.image(preview, width=100)
            if st.button(f'Example {index + 1}'):
                if index == 0:
                    # The first example's PDF is generated from its preview image.
                    preview.convert('RGB').save(pdf_name)
                st.session_state.isbutton = True
                example_pdf = pdf_name

    temp_path = None
    if example_pdf is not None:
        pdf_path = example_pdf
    elif uploaded_file is not None:
        pdf_path = temp_path = _save_upload_as_pdf(uploaded_file)
    else:
        return

    try:
        _parse_and_render(pdf_path)
    finally:
        if temp_path is not None:
            _remove_quietly(temp_path)
# Build the page only when this file is executed as the main script.
if __name__ == "__main__":
    main()