ddovidovich committed on
Commit
9b70b81
1 Parent(s): de0702c

version 0.2

Browse files

Added:
1) tula logo
2) cv samples

Files changed (4) hide show
  1. app.py +103 -46
  2. cv_melanie.jpg +0 -0
  3. cv_patrik.jpg +0 -0
  4. tulaco.png +0 -0
app.py CHANGED
@@ -19,17 +19,66 @@ from datetime import datetime
19
  from tempfile import NamedTemporaryFile
20
  import pypdfium2 as pdfium
21
 
22
- st.subheader("Upload CV in PDF or image format")
23
- uploaded_file = st.file_uploader("Upload PDF or Images", type=["pdf","png","jpg","jpeg"])
24
 
25
- nltk.download('punkt')
26
- nltk.download('averaged_perceptron_tagger')
27
 
28
- if uploaded_file:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  file_name, file_extension = os.path.splitext(uploaded_file.name)
 
30
  if file_extension != '.pdf':
31
  uploaded_image = Image.open(uploaded_file)
32
- st.image(uploaded_image,width=700)
33
  img = uploaded_image.convert('RGB')
34
  loader = UnstructuredPDFLoader(img)
35
  img.save(file_name+'.pdf')
@@ -38,40 +87,42 @@ if uploaded_file:
38
  with NamedTemporaryFile(delete=False, dir='.', suffix='.pdf') as f:
39
  f.write(uploaded_file.getbuffer())
40
  PDFFileName = f.name
41
- pdf = pdfium.PdfDocument(PDFFileName)
42
- n_pages = len(pdf)
43
- for page_number in range(n_pages):
44
- page = pdf.get_page(page_number)
45
- pil_image = page.render(scale=4).to_pil()
46
- st.image(pil_image,width=700)
47
-
48
- st.write("Document parsing in progress ...")
49
- loader = UnstructuredPDFLoader(PDFFileName)
50
- pages = loader.load_and_split()
51
- embeddings = OpenAIEmbeddings()
52
- docsearch = Chroma.from_documents(pages, embeddings).as_retriever()
53
-
54
- current_date = datetime.now()
55
- query = "Output informatio, (all in English), from the document in JSON format: full name, contacts, age, languages, education, school, places of work, skills.If some fields cannot be filled from the document, then create this field and fill it with N/A. If the date of birth is not indicated, then please calculate the approximate age of the candidate based on the information provided in the document, for calculations, take into account that graduation from the university is usually at 22 years old. Current date = "+ current_date.date().strftime('%Y-%m-%d')
56
- docs = docsearch.get_relevant_documents(query)
57
- chain = load_qa_chain(ChatOpenAI(temperature=0), chain_type="stuff")
58
- output = chain.run(input_documents=docs, question=query)
59
- st.subheader("Parsing result in JSON format")
60
- valid_json = ast.literal_eval(output)
61
- st.json(valid_json)
62
-
63
- json_data = json.loads(json.dumps(valid_json))
64
-
65
- names = [json_data.get("full_name", "N/A")]
66
- contacts = [json_data.get("contacts", "N/A")]
67
- ages = [json_data.get("age", "N/A")]
68
- languages = [json_data.get("languages", "N/A")]
69
- education = [json_data.get("education", "N/A")]
70
- school = [json_data.get("school", "N/A")]
71
- works = [json_data.get("places_of_work", "N/A")]
72
- skills = [json_data.get("skills", "N/A")]
73
-
74
- df = pd.DataFrame({
 
 
75
  "name": names,
76
  "contacts": contacts,
77
  "age": ages,
@@ -80,9 +131,15 @@ if uploaded_file:
80
  "school": school,
81
  "work": works,
82
  "skill": skills
83
- })
84
- st.subheader("Parsing result as a table")
85
- st.table(df)
86
- csv = df.to_csv(index=False).encode('utf-8')
87
- download1 = st.download_button(label="Download result as CSV",data=csv,file_name='result_df.csv',mime='text/csv')
88
- st.write("Done...")
 
 
 
 
 
 
 
19
  from tempfile import NamedTemporaryFile
20
  import pypdfium2 as pdfium
21
 
22
+ examples=["CV.png","cv_patrik.jpg","cv_melanie.jpg"]
23
+ examples_pdf=["CV.pdf","CV_Patrik.pdf","CV_Melanie.pdf"]
24
 
 
 
25
 
26
+ def load_image(image_file):
27
+ img = Image.open(image_file)
28
+ return img
29
+
30
+ def main():
31
+ head1, head2 = st.columns(2)
32
+ with head1:
33
+ tula_logo=load_image('tulaco.png')
34
+ st.image(tula_logo,width=200)
35
+ with head2:
36
+ st.write('mail@tula.co')
37
+ st.write('www.tula.co')
38
+ st.title("CV parsing with Chat GPT")
39
+ PDFFileName = ''
40
+
41
+ if not "initialized" in st.session_state:
42
+ st.session_state.isbutton = False
43
+ st.session_state.initialized = True
44
+
45
+ uploaded_file = st.file_uploader("Upload CV in PDF or image format", type=["pdf","png","jpg","jpeg"])
46
+
47
+ nltk.download('punkt')
48
+ nltk.download('averaged_perceptron_tagger')
49
+
50
+ st.subheader("CV examples")
51
+ col1, col2, col3 = st.columns(3)
52
+ with col1:
53
+ ex=load_image(examples[0])
54
+ st.image(ex,width=100)
55
+ if st.button('Example 1'):
56
+ ex=load_image(examples[0])
57
+ img = ex.convert('RGB')
58
+ loader = UnstructuredPDFLoader(img)
59
+ img.save('CV.pdf')
60
+ st.session_state.isbutton=True
61
+ PDFFileName=examples_pdf[0]
62
+
63
+ with col2:
64
+ ex1=load_image(examples[1])
65
+ st.image(ex1,width=100)
66
+ if st.button('Example 2'):
67
+ st.session_state.isbutton=True
68
+ PDFFileName = examples_pdf[1]
69
+
70
+ with col3:
71
+ ex2=load_image(examples[2])
72
+ st.image(ex2,width=100)
73
+ if st.button('Example 3'):
74
+ st.session_state.isbutton=True
75
+ PDFFileName = examples_pdf[2]
76
+
77
+ if (uploaded_file is not None) and (st.session_state.isbutton==False):
78
  file_name, file_extension = os.path.splitext(uploaded_file.name)
79
+
80
  if file_extension != '.pdf':
81
  uploaded_image = Image.open(uploaded_file)
 
82
  img = uploaded_image.convert('RGB')
83
  loader = UnstructuredPDFLoader(img)
84
  img.save(file_name+'.pdf')
 
87
  with NamedTemporaryFile(delete=False, dir='.', suffix='.pdf') as f:
88
  f.write(uploaded_file.getbuffer())
89
  PDFFileName = f.name
90
+
91
+ if PDFFileName != '':
92
+ pdf = pdfium.PdfDocument(PDFFileName)
93
+ n_pages = len(pdf)
94
+ for page_number in range(n_pages):
95
+ page = pdf.get_page(page_number)
96
+ pil_image = page.render(scale=4).to_pil()
97
+ st.image(pil_image,width=700)
98
+
99
+ with st.spinner('Document parsing in progress ...'):
100
+ loader = UnstructuredPDFLoader(PDFFileName)
101
+ pages = loader.load_and_split()
102
+ embeddings = OpenAIEmbeddings()
103
+ docsearch = Chroma.from_documents(pages, embeddings).as_retriever()
104
+
105
+ current_date = datetime.now()
106
+ query = "Output informatio, (all in English), from the document in JSON format: full name, contacts, age, languages, education, school, work experience, skills. If some fields cannot be filled from the document, then create this field and fill it with N/A. If the date of birth is not indicated, then please calculate the approximate age of the candidate based on the information provided in the document, for calculations, take into account that graduation from the university is usually at 22 years old. Current date = "+ current_date.date().strftime('%Y-%m-%d')
107
+ docs = docsearch.get_relevant_documents(query)
108
+ chain = load_qa_chain(ChatOpenAI(temperature=0), chain_type="stuff")
109
+ output = chain.run(input_documents=docs, question=query)
110
+ st.subheader("Parsing result in JSON format")
111
+ valid_json = ast.literal_eval(output)
112
+ st.json(valid_json)
113
+
114
+ json_data = json.loads(json.dumps(valid_json))
115
+
116
+ names = [json_data.get("full_name", "N/A")]
117
+ contacts = [json_data.get("contacts", "N/A")]
118
+ ages = [json_data.get("age", "N/A")]
119
+ languages = [json_data.get("languages", "N/A")]
120
+ education = [json_data.get("education", "N/A")]
121
+ school = [json_data.get("school", "N/A")]
122
+ works = [json_data.get("work_experience", "N/A")]
123
+ skills = [json_data.get("skills", "N/A")]
124
+
125
+ df = pd.DataFrame({
126
  "name": names,
127
  "contacts": contacts,
128
  "age": ages,
 
131
  "school": school,
132
  "work": works,
133
  "skill": skills
134
+ })
135
+ st.subheader("Parsing result as a table")
136
+ st.table(df)
137
+ csv = df.to_csv(index=False).encode('utf-8')
138
+ download1 = st.download_button(label="Download result as CSV",data=csv,file_name='result_df.csv',mime='text/csv')
139
+ PDFFileName = ''
140
+ uploaded_file = None
141
+ st.success("Ready!")
142
+
143
+
144
+ if __name__ == "__main__":
145
+ main()
cv_melanie.jpg ADDED
cv_patrik.jpg ADDED
tulaco.png ADDED