# Spaces: DzmitryXXL / CV_Parser
# Status: Runtime error
# File size: 5,143 Bytes

import streamlit as st
import os 
import nltk
import json
import base64
import pandas as pd
import ast

# Copy the deployment secret into the variable the OpenAI client reads.
# os.getenv returns None when SECRET_KEY is not configured, and assigning None
# into os.environ raises TypeError, which would crash the app at import time.
# This runs before the langchain imports below, as in the original layout.
_secret_key = os.getenv("SECRET_KEY")
if _secret_key is not None:
    os.environ["OPENAI_API_KEY"] = _secret_key

from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain   
from PIL import Image
from datetime import datetime
from tempfile import NamedTemporaryFile
import pypdfium2 as pdfium
    
# Preview images of the bundled sample CVs, shown as thumbnails in the UI.
examples = [
    "CV.png",
    "cv_patrik.jpg",
    "cv_melanie.jpg",
]
# PDF counterpart of each sample; index i here pairs with examples[i].
examples_pdf = [
    "CV.pdf",
    "CV_Patrik.pdf",
    "CV_Melanie.pdf",
]


def load_image(image_file):
    """Open *image_file* with PIL and return the resulting image object.

    *image_file* is passed straight to ``Image.open``.
    """
    return Image.open(image_file)

def _parse_llm_json(output):
    """Turn the chat model's textual reply into a dict.

    Strict JSON is tried first because ``ast.literal_eval`` rejects the JSON
    literals ``null``, ``true`` and ``false``. ``ast.literal_eval`` is kept as
    a fallback for Python-literal style replies (e.g. single-quoted keys). If
    the whole reply does not parse, the span from the first ``{`` to the last
    ``}`` is tried as well, which covers replies wrapped in a code fence or
    surrounded by extra text.

    Raises:
        ValueError: if no dict can be extracted from *output*.
    """
    text = output.strip()
    candidates = [text]
    first, last = text.find("{"), text.rfind("}")
    if first != -1 and last > first:
        candidates.append(text[first:last + 1])

    for candidate in candidates:
        for parse in (json.loads, ast.literal_eval):
            try:
                parsed = parse(candidate)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                continue
            if isinstance(parsed, dict):
                return parsed
    raise ValueError("model reply does not contain a JSON object")


def _lookup_field(data, name):
    """Return the value stored under *name* in *data*, or ``"N/A"``.

    An exact key match wins. Otherwise keys are compared ignoring case and
    treating spaces and underscores as equal, because the prompt names the
    fields with spaces ("full name") while the table uses "full_name".
    """
    if name in data:
        return data[name]
    wanted = name.replace("_", " ").strip().casefold()
    for key, value in data.items():
        if str(key).replace("_", " ").strip().casefold() == wanted:
            return value
    return "N/A"


def _result_frame(data):
    """Build the one-row result table from the parsed reply *data*."""
    columns = (
        ("name", "full_name"),
        ("contacts", "contacts"),
        ("age", "age"),
        ("languages", "languages"),
        ("education", "education"),
        ("school", "school"),
        ("work", "work_experience"),
        ("skill", "skills"),
    )
    return pd.DataFrame(
        {column: [_lookup_field(data, key)] for column, key in columns}
    )


def _image_to_temp_pdf(image):
    """Save *image* as a PDF in a fresh temporary file and return its path.

    A generated name is used instead of the uploaded file's name so that an
    upload called e.g. ``CV.png`` cannot overwrite the bundled ``CV.pdf``.
    """
    with NamedTemporaryFile(delete=False, dir='.', suffix='.pdf') as handle:
        pdf_path = handle.name
    image.convert('RGB').save(pdf_path)
    return pdf_path


def _parse_and_show(pdf_path):
    """Render the PDF at *pdf_path*, query the model and display the result."""
    pdf = pdfium.PdfDocument(pdf_path)
    for page_number in range(len(pdf)):
        page = pdf.get_page(page_number)
        st.image(page.render(scale=4).to_pil(), width=700)

    with st.spinner('Document parsing in progress ...'):
        pages = UnstructuredPDFLoader(pdf_path).load_and_split()
        docsearch = Chroma.from_documents(pages, OpenAIEmbeddings()).as_retriever()

        # The prompt text is kept exactly as before; only the layout changed.
        query = (
            "Output informatio, (all in English), from the document in JSON format: "
            "full name, contacts, age, languages, education, school, work experience, skills. "
            "If some fields cannot be filled from the document, then create this field and fill it with N/A. "
            "If the date of birth is not indicated, then please calculate the approximate age of the candidate "
            "based on the information provided in the document, for calculations, take into account that "
            "graduation from the university is usually at 22 years old. Current date = "
        ) + datetime.now().date().strftime('%Y-%m-%d')
        docs = docsearch.get_relevant_documents(query)
        chain = load_qa_chain(ChatOpenAI(temperature=0), chain_type="stuff")
        output = chain.run(input_documents=docs, question=query)

        st.subheader("Parsing result in JSON format")
        try:
            parsed = _parse_llm_json(output)
        except ValueError:
            # Show the raw reply instead of crashing with a traceback.
            st.error("The model reply could not be parsed as JSON.")
            st.text(output)
            return
        st.json(parsed)

        df = _result_frame(parsed)
        st.subheader("Parsing result as a table")
        st.table(df)
        csv = df.to_csv(index=False).encode('utf-8')
        st.download_button(
            label="Download result as CSV",
            data=csv,
            file_name='result_df.csv',
            mime='text/csv',
        )
        st.success("Ready!")


def main():
    """Render the CV-parsing page and process the selected document.

    The user either uploads a CV (PDF or image) or clicks one of the three
    bundled examples. The chosen document is displayed page by page, sent
    through the retrieval/QA chain, and the reply is shown as JSON, as a
    table, and offered as a CSV download.
    """
    head1, head2 = st.columns(2)
    with head1:
        st.image(load_image('tulaco.png'), width=200)
    with head2:
        st.write('mail@tula.co')
        st.write('www.tula.co')
    st.title("CV parsing with Chat GPT")

    if "initialized" not in st.session_state:
        st.session_state.initialized = True
    # st.button is True only on the rerun triggered by its click, so the flag
    # must be cleared on every run; leaving it set made every later upload
    # in the same session be ignored.
    st.session_state.isbutton = False

    uploaded_file = st.file_uploader(
        "Upload CV in PDF or image format", type=["pdf", "png", "jpg", "jpeg"]
    )

    nltk.download('punkt')
    nltk.download('averaged_perceptron_tagger')

    pdf_path = ''
    temp_pdf = None  # path of a file created by this run, removed at the end

    st.subheader("CV examples")
    for index, column in enumerate(st.columns(3)):
        with column:
            preview = load_image(examples[index])
            st.image(preview, width=100)
            if st.button(f'Example {index + 1}'):
                if index == 0:
                    # As before, the first example's PDF is (re)generated
                    # from its preview image on every click.
                    preview.convert('RGB').save(examples_pdf[0])
                st.session_state.isbutton = True
                pdf_path = examples_pdf[index]

    # An example click takes precedence over a previously uploaded file.
    if uploaded_file is not None and not st.session_state.isbutton:
        _, extension = os.path.splitext(uploaded_file.name)
        if extension.lower() == '.pdf':
            with NamedTemporaryFile(delete=False, dir='.', suffix='.pdf') as handle:
                handle.write(uploaded_file.getbuffer())
                temp_pdf = handle.name
        else:
            temp_pdf = _image_to_temp_pdf(Image.open(uploaded_file))
        pdf_path = temp_pdf

    if not pdf_path:
        return

    try:
        _parse_and_show(pdf_path)
    finally:
        if temp_pdf is not None:
            try:
                os.remove(temp_pdf)
            except OSError:
                # Best-effort cleanup: the file may still be held open.
                pass


# Run the app only when this file is executed as the main script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()