File size: 5,143 Bytes
82c7530
22b0051
 
 
ac438eb
cdf20ae
de0702c
2e95f21
eac2263
 
22b0051
 
 
 
 
 
 
 
 
75539bd
eac2263
9b70b81
 
22b0051
797f3b1
9b70b81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22b0051
9b70b81
22b0051
 
 
 
 
 
 
18baae6
22b0051
 
9b70b81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5ae3ab6
e581254
5ae3ab6
 
 
 
 
 
9b70b81
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import streamlit as st
import os 
import nltk
import json
import base64
import pandas as pd
import ast

# Expose the deployment secret under the variable name the OpenAI client reads.
# os.environ only accepts strings, so assigning the None returned for an unset
# SECRET_KEY would raise TypeError at import time; in that case any existing
# OPENAI_API_KEY is left untouched instead.
_secret_key = os.getenv("SECRET_KEY")
if _secret_key is not None:
    os.environ["OPENAI_API_KEY"] = _secret_key

from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain   
from PIL import Image
from datetime import datetime
from tempfile import NamedTemporaryFile
import pypdfium2 as pdfium
    
# Preview images shown in the "CV examples" row, one per example button.
examples = [
    "CV.png",
    "cv_patrik.jpg",
    "cv_melanie.jpg",
]
# PDF file used for parsing when the example at the same index is selected.
examples_pdf = [
    "CV.pdf",
    "CV_Patrik.pdf",
    "CV_Melanie.pdf",
]


def load_image(image_file):
    """Open ``image_file`` with ``Image.open`` and return the resulting image."""
    return Image.open(image_file)

# (table column, JSON key) pairs, in the column order of the result table.
_RESULT_COLUMNS = (
    ("name", "full_name"),
    ("contacts", "contacts"),
    ("age", "age"),
    ("languages", "languages"),
    ("education", "education"),
    ("school", "school"),
    ("work", "work_experience"),
    ("skill", "skills"),
)


def _parse_model_output(output):
    """Turn the chat model's answer into a dict.

    Strict JSON is tried first (so ``null``/``true``/``false`` work), then
    ``ast.literal_eval`` as a fallback for Python-literal style answers
    (e.g. single-quoted keys). Text around the outermost ``{...}`` — such as
    a Markdown code fence — is ignored.

    Raises:
        ValueError: if no dict can be recovered from ``output``.
    """
    text = output.strip()
    start, end = text.find("{"), text.rfind("}")
    if start != -1 and end > start:
        text = text[start:end + 1]
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError, MemoryError, RecursionError) as err:
            raise ValueError("model answer is not valid JSON") from err
    if not isinstance(parsed, dict):
        raise ValueError("model answer is not a JSON object")
    return parsed


def _lookup_field(data, key):
    """Return ``data[key]``, tolerating case and space/underscore/hyphen variants.

    The prompt names fields in plain words ("full name"), so the model may
    answer with "full name", "Full Name" or "full_name". Returns "N/A" when
    no variant of ``key`` is present.
    """
    if key in data:
        return data[key]

    def normalize(name):
        return " ".join(str(name).replace("_", " ").replace("-", " ").lower().split())

    wanted = normalize(key)
    for candidate, value in data.items():
        if normalize(candidate) == wanted:
            return value
    return "N/A"


def _save_upload_as_pdf(uploaded_file):
    """Store an uploaded CV as a temporary PDF in the working directory.

    PDF uploads are copied as-is; any other upload is opened as an image and
    converted to PDF. A temporary name is used so that an upload can never
    overwrite one of the bundled example PDFs. Returns the temp file path;
    the caller is responsible for deleting it.
    """
    extension = os.path.splitext(uploaded_file.name)[1].lower()
    is_pdf = extension == '.pdf'
    with NamedTemporaryFile(delete=False, dir='.', suffix='.pdf') as handle:
        if is_pdf:
            handle.write(uploaded_file.getbuffer())
    if not is_pdf:
        # The ".pdf" suffix of the temp name selects Pillow's PDF writer.
        Image.open(uploaded_file).convert('RGB').save(handle.name)
    return handle.name


def _show_pdf_pages(pdf_path):
    """Render every page of the PDF at ``pdf_path`` into the Streamlit page."""
    pdf = pdfium.PdfDocument(pdf_path)
    for page_number in range(len(pdf)):
        page = pdf.get_page(page_number)
        st.image(page.render(scale=4).to_pil(), width=700)


def _query_cv(pdf_path):
    """Index the PDF, ask the chat model for the CV fields, return its raw answer."""
    import uuid

    pages = UnstructuredPDFLoader(pdf_path).load_and_split()
    # A fresh collection per call keeps chunks of previously parsed CVs out of
    # the retrieval results (the default collection name is otherwise reused
    # for every call in this process).
    vectorstore = Chroma.from_documents(
        pages,
        OpenAIEmbeddings(),
        collection_name=f"cv-{uuid.uuid4().hex}",
    )
    retriever = vectorstore.as_retriever()

    query = (
        "Output information, (all in English), from the document in JSON format: "
        "full name, contacts, age, languages, education, school, work experience, skills. "
        "If some fields cannot be filled from the document, then create this field and fill it with N/A. "
        "If the date of birth is not indicated, then please calculate the approximate age of the candidate "
        "based on the information provided in the document, for calculations, take into account that "
        "graduation from the university is usually at 22 years old. "
        "Current date = " + datetime.now().date().strftime('%Y-%m-%d')
    )
    docs = retriever.get_relevant_documents(query)
    chain = load_qa_chain(ChatOpenAI(temperature=0), chain_type="stuff")
    return chain.run(input_documents=docs, question=query)


def main():
    """Render the CV-parsing page.

    Shows the header and three example CVs, accepts an uploaded PDF or image,
    displays the chosen document page by page, asks the chat model to extract
    the CV fields, and presents the answer as JSON, as a one-row table and as
    a downloadable CSV. An example button clicked in the current run takes
    precedence over an uploaded file.
    """
    head1, head2 = st.columns(2)
    with head1:
        st.image(load_image('tulaco.png'), width=200)
    with head2:
        st.write('mail@tula.co')
        st.write('www.tula.co')
    st.title("CV parsing with Chat GPT")

    if "initialized" not in st.session_state:
        st.session_state.initialized = True
    # Reset on every run: a button is only "clicked" for the run it triggers.
    # Leaving the flag set would block uploads for the rest of the session.
    st.session_state.isbutton = False

    uploaded_file = st.file_uploader(
        "Upload CV in PDF or image format", type=["pdf", "png", "jpg", "jpeg"]
    )

    nltk.download('punkt')
    nltk.download('averaged_perceptron_tagger')

    st.subheader("CV examples")
    pdf_path = ''
    for index, column in enumerate(st.columns(3)):
        with column:
            preview = load_image(examples[index])
            st.image(preview, width=100)
            if st.button(f'Example {index + 1}'):
                if index == 0:
                    # The first example's PDF is (re)built from its preview image.
                    preview.convert('RGB').save(examples_pdf[0])
                st.session_state.isbutton = True
                pdf_path = examples_pdf[index]

    temp_path = ''
    if uploaded_file is not None and not st.session_state.isbutton:
        temp_path = _save_upload_as_pdf(uploaded_file)
        pdf_path = temp_path

    if not pdf_path:
        return

    try:
        _show_pdf_pages(pdf_path)

        with st.spinner('Document parsing in progress ...'):
            output = _query_cv(pdf_path)

        st.subheader("Parsing result in JSON format")
        try:
            parsed = _parse_model_output(output)
        except ValueError:
            st.error("The model did not return a valid JSON object.")
            st.text(output)
            return
        st.json(parsed)

        df = pd.DataFrame(
            {column: [_lookup_field(parsed, key)] for column, key in _RESULT_COLUMNS}
        )
        st.subheader("Parsing result as a table")
        st.table(df)
        csv = df.to_csv(index=False).encode('utf-8')
        st.download_button(
            label="Download result as CSV",
            data=csv,
            file_name='result_df.csv',
            mime='text/csv',
        )
        st.success("Ready!")
    finally:
        if temp_path:
            try:
                os.remove(temp_path)
            except OSError:
                # Best-effort cleanup; a leftover temp file must not break the page.
                pass


# Entry point: build the page only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()