Spaces:

DzmitryXXL
/

CV_Parser

Runtime error

CV_Parser / app.py

ddovidovich

version 0.2

9b70b81 about 2 years ago

5.14 kB

	import streamlit as st
	import os
	import nltk
	import json
	import base64
	import pandas as pd
	import ast

	# The OpenAI client reads OPENAI_API_KEY, while this app takes its key from
	# SECRET_KEY. os.environ only accepts str values, so assigning the None that
	# os.getenv returns for an unset variable would raise TypeError at import
	# time. Copy the value only when it is present; any OPENAI_API_KEY already in
	# the environment is left untouched otherwise.
	_secret_key = os.getenv("SECRET_KEY")
	if _secret_key is not None:
	    os.environ["OPENAI_API_KEY"] = _secret_key

	from langchain.document_loaders import PyPDFLoader
	from langchain.vectorstores import Chroma
	from langchain.embeddings.openai import OpenAIEmbeddings
	from langchain.document_loaders import UnstructuredPDFLoader
	from langchain.chat_models import ChatOpenAI
	from langchain.chains.question_answering import load_qa_chain
	from PIL import Image
	from datetime import datetime
	from tempfile import NamedTemporaryFile
	import pypdfium2 as pdfium

	# Preview images of the bundled sample CVs, shown as thumbnails.
	examples = [
	    "CV.png",
	    "cv_patrik.jpg",
	    "cv_melanie.jpg",
	]
	# PDF file parsed for each sample; positions line up with `examples`.
	examples_pdf = [
	    "CV.pdf",
	    "CV_Patrik.pdf",
	    "CV_Melanie.pdf",
	]


	def load_image(image_file):
	    """Open ``image_file`` with Pillow and return the resulting image object.

	    ``image_file`` is handed to ``Image.open`` unchanged.
	    """
	    return Image.open(image_file)

	def _parse_llm_json(output):
	    """Parse the chat model's reply into a dict.

	    The reply is expected to contain one JSON object. Text outside the
	    outermost braces (a Markdown fence, a lead-in sentence) is ignored.
	    ``json.loads`` is tried first so JSON literals such as ``null`` and
	    ``true`` are accepted; ``ast.literal_eval`` is kept as a fallback for
	    replies written as a Python-style dict with single quotes.

	    Raises:
	        ValueError: if the reply cannot be parsed or is not an object.
	    """
	    text = output.strip()
	    start, end = text.find("{"), text.rfind("}")
	    if start != -1 and end > start:
	        text = text[start:end + 1]
	    try:
	        parsed = json.loads(text)
	    except json.JSONDecodeError:
	        try:
	            parsed = ast.literal_eval(text)
	        except (ValueError, SyntaxError, TypeError) as err:
	            raise ValueError("model reply is not valid JSON") from err
	    if not isinstance(parsed, dict):
	        raise ValueError("model reply is not a JSON object")
	    return parsed


	def _result_table(record):
	    """Flatten the parsed CV fields into a one-row DataFrame.

	    Keys are matched case-insensitively with spaces treated as underscores,
	    because the prompt names the fields in plain words ("full name",
	    "work experience") and the model may echo them in that form. Fields
	    missing from ``record`` are filled with "N/A".
	    """
	    by_key = {
	        str(key).strip().lower().replace(" ", "_"): value
	        for key, value in record.items()
	    }
	    columns = (
	        ("name", "full_name"),
	        ("contacts", "contacts"),
	        ("age", "age"),
	        ("languages", "languages"),
	        ("education", "education"),
	        ("school", "school"),
	        ("work", "work_experience"),
	        ("skill", "skills"),
	    )
	    # Each value is wrapped in a list so nested lists/dicts stay in one cell.
	    return pd.DataFrame(
	        {column: [by_key.get(key, "N/A")] for column, key in columns}
	    )


	def _save_upload_as_pdf(uploaded_file):
	    """Store an uploaded CV as a temporary PDF in the working directory.

	    PDF uploads are copied unchanged; any other upload is opened as an image
	    and converted to PDF. Returns the path of the temporary file; the caller
	    is responsible for deleting it.
	    """
	    # Compare the extension case-insensitively so "resume.PDF" is not
	    # mistaken for an image.
	    extension = os.path.splitext(uploaded_file.name)[1].lower()
	    handle = NamedTemporaryFile(delete=False, dir='.', suffix='.pdf')
	    try:
	        with handle:
	            if extension == '.pdf':
	                handle.write(uploaded_file.getbuffer())
	            else:
	                image = Image.open(uploaded_file).convert('RGB')
	                image.save(handle, format='PDF')
	    except Exception:
	        # Do not leave a half-written file behind when conversion fails.
	        os.remove(handle.name)
	        raise
	    return handle.name


	def _render_pages(pdf_path):
	    """Show every page of the PDF at ``pdf_path`` as an image."""
	    pdf = pdfium.PdfDocument(pdf_path)
	    for page_number in range(len(pdf)):
	        page = pdf.get_page(page_number)
	        st.image(page.render(scale=4).to_pil(), width=700)


	def _extract_fields(pdf_path):
	    """Query the chat model about the PDF and return its raw text reply."""
	    loader = UnstructuredPDFLoader(pdf_path)
	    pages = loader.load_and_split()
	    embeddings = OpenAIEmbeddings()
	    docsearch = Chroma.from_documents(pages, embeddings).as_retriever()

	    current_date = datetime.now()
	    query = "Output informatio, (all in English), from the document in JSON format: full name, contacts, age, languages, education, school, work experience, skills. If some fields cannot be filled from the document, then create this field and fill it with N/A. If the date of birth is not indicated, then please calculate the approximate age of the candidate based on the information provided in the document, for calculations, take into account that graduation from the university is usually at 22 years old. Current date = "+ current_date.date().strftime('%Y-%m-%d')
	    docs = docsearch.get_relevant_documents(query)
	    chain = load_qa_chain(ChatOpenAI(temperature=0), chain_type="stuff")
	    return chain.run(input_documents=docs, question=query)


	def _show_results(output):
	    """Display the model reply as JSON, a table and a CSV download.

	    Returns True when the reply was parsed and shown, False when it could
	    not be parsed (an error and the raw reply are shown instead).
	    """
	    st.subheader("Parsing result in JSON format")
	    try:
	        record = _parse_llm_json(output)
	    except ValueError:
	        st.error("The model reply could not be parsed as JSON.")
	        st.text(output)
	        return False
	    st.json(record)

	    df = _result_table(record)
	    st.subheader("Parsing result as a table")
	    st.table(df)
	    csv = df.to_csv(index=False).encode('utf-8')
	    st.download_button(label="Download result as CSV",data=csv,file_name='result_df.csv',mime='text/csv')
	    return True


	def main():
	    """Render the CV-parsing page and process the selected CV, if any.

	    The CV is either one of the bundled examples (chosen with a button) or a
	    file uploaded by the user; an example click takes precedence over an
	    upload during the same run.
	    """
	    head1, head2 = st.columns(2)
	    with head1:
	        tula_logo = load_image('tulaco.png')
	        st.image(tula_logo, width=200)
	    with head2:
	        st.write('mail@tula.co')
	        st.write('www.tula.co')
	    st.title("CV parsing with Chat GPT")
	    PDFFileName = ''

	    # Streamlit re-executes the script on every interaction and a button
	    # reports True only for the run triggered by its click. Reset the flag on
	    # each run; keeping it set across runs made every later upload in the
	    # session be ignored once any example had been clicked.
	    st.session_state.isbutton = False
	    st.session_state.initialized = True

	    uploaded_file = st.file_uploader("Upload CV in PDF or image format", type=["pdf","png","jpg","jpeg"])

	    nltk.download('punkt')
	    nltk.download('averaged_perceptron_tagger')

	    st.subheader("CV examples")
	    col1, col2, col3 = st.columns(3)
	    with col1:
	        ex = load_image(examples[0])
	        st.image(ex, width=100)
	        if st.button('Example 1'):
	            # The first example's PDF is regenerated from its image on each
	            # click; the other two examples use their PDFs as they are.
	            img = load_image(examples[0]).convert('RGB')
	            img.save('CV.pdf')
	            st.session_state.isbutton = True
	            PDFFileName = examples_pdf[0]

	    with col2:
	        ex1 = load_image(examples[1])
	        st.image(ex1, width=100)
	        if st.button('Example 2'):
	            st.session_state.isbutton = True
	            PDFFileName = examples_pdf[1]

	    with col3:
	        ex2 = load_image(examples[2])
	        st.image(ex2, width=100)
	        if st.button('Example 3'):
	            st.session_state.isbutton = True
	            PDFFileName = examples_pdf[2]

	    # Path of the temporary copy of an upload, removed once processing ends.
	    temp_pdf = None
	    if uploaded_file is not None and not st.session_state.isbutton:
	        temp_pdf = _save_upload_as_pdf(uploaded_file)
	        PDFFileName = temp_pdf

	    if PDFFileName == '':
	        return

	    try:
	        _render_pages(PDFFileName)
	        with st.spinner('Document parsing in progress ...'):
	            output = _extract_fields(PDFFileName)
	        if _show_results(output):
	            st.success("Ready!")
	    finally:
	        if temp_pdf is not None:
	            try:
	                os.remove(temp_pdf)
	            except OSError:
	                # Best-effort cleanup: the file may still be held open
	                # (e.g. on Windows); leaving it behind is harmless.
	                pass


	# Build the page only when this file is executed as the main script.
	if __name__ == "__main__":
	    main()