"""Streamlit app that parses a CV (PDF or image) into structured data.

The uploaded or example document is rendered page by page, indexed into a
Chroma vector store, and a chat model is asked to return the candidate's
details as JSON.  The result is shown as JSON, as a table, and offered as a
CSV download.
"""
import ast
import base64
import contextlib
import json
import os
from datetime import datetime
from tempfile import NamedTemporaryFile

# Expose the deployment secret under the name the OpenAI client reads.
# os.environ only accepts strings, so an unset SECRET_KEY must be skipped
# rather than assigned (assigning None raises TypeError at import time).
_api_key = os.getenv("SECRET_KEY")
if _api_key:
    os.environ["OPENAI_API_KEY"] = _api_key

import nltk
import pandas as pd
import pypdfium2 as pdfium
import streamlit as st
from langchain.chains.question_answering import load_qa_chain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from PIL import Image

examples = ["CV.png", "cv_patrik.jpg", "cv_melanie.jpg"]
examples_pdf = ["CV.pdf", "CV_Patrik.pdf", "CV_Melanie.pdf"]

# (table column, normalised JSON key) pairs, in display order.
_CV_FIELDS = (
    ("name", "full_name"),
    ("contacts", "contacts"),
    ("age", "age"),
    ("languages", "languages"),
    ("education", "education"),
    ("school", "school"),
    ("work", "work_experience"),
    ("skill", "skills"),
)


def load_image(image_file):
    """Open *image_file* (a path or file-like object) as a PIL image."""
    return Image.open(image_file)


def _clear_example_flag():
    """Forget a previous example click so a newly uploaded file is used."""
    st.session_state.isbutton = False


def _normalize_key(key):
    """Return *key* lower-cased with spaces and hyphens turned into underscores."""
    return str(key).strip().lower().replace(" ", "_").replace("-", "_")


def _parse_llm_json(raw_output):
    """Parse the JSON object embedded in the model's reply.

    The reply may be wrapped in prose or a Markdown code fence, so only the
    span from the first ``{`` to the last ``}`` is parsed.  ``json.loads`` is
    tried first; ``ast.literal_eval`` (literals only, no code execution) is a
    fallback for replies that use Python-style single quotes.

    Raises:
        ValueError: if no object can be extracted or parsed.
    """
    start = raw_output.find("{")
    end = raw_output.rfind("}")
    if start == -1 or end < start:
        raise ValueError("no JSON object found in the model output")
    payload = raw_output[start:end + 1]
    try:
        parsed = json.loads(payload)
    except json.JSONDecodeError:
        try:
            parsed = ast.literal_eval(payload)
        except (ValueError, SyntaxError, TypeError) as err:
            raise ValueError(
                "model output is neither valid JSON nor a Python literal"
            ) from err
    if not isinstance(parsed, dict):
        raise ValueError("model output is not a JSON object")
    return parsed


def _save_upload_as_pdf(uploaded_file):
    """Write the upload to a temporary PDF and return its path.

    Images are converted to a one-page PDF.  A temporary name is used for
    both cases so an upload can never overwrite one of the bundled example
    PDFs.  The caller is responsible for deleting the file.
    """
    _, extension = os.path.splitext(uploaded_file.name)
    with NamedTemporaryFile(delete=False, dir='.', suffix='.pdf') as tmp:
        if extension.lower() == '.pdf':
            tmp.write(uploaded_file.getbuffer())
        else:
            Image.open(uploaded_file).convert('RGB').save(tmp, format='PDF')
    return tmp.name


def _build_query():
    """Return the extraction prompt, stamped with today's date."""
    current_date = datetime.now()
    return (
        "Output informatio, (all in English), from the document in JSON format: "
        "full name, contacts, age, languages, education, school, work experience, skills. "
        "If some fields cannot be filled from the document, then create this field and fill it with N/A. "
        "If the date of birth is not indicated, then please calculate the approximate age of the candidate "
        "based on the information provided in the document, for calculations, take into account that "
        "graduation from the university is usually at 22 years old. "
        "Current date = " + current_date.date().strftime('%Y-%m-%d')
    )


def _process_cv(pdf_path):
    """Render *pdf_path*, query the model about it and display the results."""
    pdf = pdfium.PdfDocument(pdf_path)
    for page_number in range(len(pdf)):
        page = pdf.get_page(page_number)
        st.image(page.render(scale=4).to_pil(), width=700)

    if not os.environ.get("OPENAI_API_KEY"):
        st.error("OpenAI API key is not configured (set the SECRET_KEY environment variable).")
        return

    with st.spinner('Document parsing in progress ...'):
        pages = UnstructuredPDFLoader(pdf_path).load_and_split()
        docsearch = Chroma.from_documents(pages, OpenAIEmbeddings()).as_retriever()
        query = _build_query()
        docs = docsearch.get_relevant_documents(query)
        chain = load_qa_chain(ChatOpenAI(temperature=0), chain_type="stuff")
        output = chain.run(input_documents=docs, question=query)

    st.subheader("Parsing result in JSON format")
    try:
        parsed = _parse_llm_json(output)
    except ValueError as err:
        st.error(f"Could not parse the model output: {err}")
        st.text(output)
        return
    st.json(parsed)

    # The prompt names fields with spaces ("full name"), so the model's keys
    # are normalised before being looked up by their snake_case names.
    by_key = {_normalize_key(key): value for key, value in parsed.items()}
    df = pd.DataFrame(
        {column: [by_key.get(key, "N/A")] for column, key in _CV_FIELDS}
    )

    st.subheader("Parsing result as a table")
    st.table(df)
    csv = df.to_csv(index=False).encode('utf-8')
    st.download_button(
        label="Download result as CSV",
        data=csv,
        file_name='result_df.csv',
        mime='text/csv',
    )
    st.success("Ready!")


def main():
    """Build the page, pick the document to parse and show the results."""
    head1, head2 = st.columns(2)
    with head1:
        st.image(load_image('tulaco.png'), width=200)
    with head2:
        st.write('mail@tula.co')
        st.write('www.tula.co')

    st.title("CV parsing with Chat GPT")

    if "initialized" not in st.session_state:
        st.session_state.isbutton = False
        st.session_state.initialized = True

    # on_change clears the example flag; without it, one example click would
    # make every later upload in the same session be ignored.
    uploaded_file = st.file_uploader(
        "Upload CV in PDF or image format",
        type=["pdf", "png", "jpg", "jpeg"],
        on_change=_clear_example_flag,
    )

    nltk.download('punkt')
    nltk.download('averaged_perceptron_tagger')

    pdf_path = ''
    st.subheader("CV examples")
    for index, column in enumerate(st.columns(3)):
        with column:
            st.image(load_image(examples[index]), width=100)
            if st.button(f'Example {index + 1}'):
                if index == 0:
                    # The first example's PDF is generated from its image.
                    load_image(examples[0]).convert('RGB').save(examples_pdf[0])
                st.session_state.isbutton = True
                pdf_path = examples_pdf[index]

    temp_path = None
    if uploaded_file is not None and not st.session_state.isbutton:
        temp_path = _save_upload_as_pdf(uploaded_file)
        pdf_path = temp_path

    if not pdf_path:
        return

    try:
        _process_cv(pdf_path)
    finally:
        # Best-effort cleanup of the temporary copy of the upload.
        if temp_path is not None:
            with contextlib.suppress(OSError):
                os.remove(temp_path)


if __name__ == "__main__":
    main()