Spaces:
Runtime error
Runtime error
ddovidovich
committed on
Commit
•
9b70b81
1
Parent(s):
de0702c
version 0.2
Browse files
Added:
1) tula logo
2) cv samples
- app.py +103 -46
- cv_melanie.jpg +0 -0
- cv_patrik.jpg +0 -0
- tulaco.png +0 -0
app.py
CHANGED
@@ -19,17 +19,66 @@ from datetime import datetime
|
|
19 |
from tempfile import NamedTemporaryFile
|
20 |
import pypdfium2 as pdfium
|
21 |
|
22 |
-
|
23 |
-
|
24 |
|
25 |
-
nltk.download('punkt')
|
26 |
-
nltk.download('averaged_perceptron_tagger')
|
27 |
|
28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
file_name, file_extension = os.path.splitext(uploaded_file.name)
|
|
|
30 |
if file_extension != '.pdf':
|
31 |
uploaded_image = Image.open(uploaded_file)
|
32 |
-
st.image(uploaded_image,width=700)
|
33 |
img = uploaded_image.convert('RGB')
|
34 |
loader = UnstructuredPDFLoader(img)
|
35 |
img.save(file_name+'.pdf')
|
@@ -38,40 +87,42 @@ if uploaded_file:
|
|
38 |
with NamedTemporaryFile(delete=False, dir='.', suffix='.pdf') as f:
|
39 |
f.write(uploaded_file.getbuffer())
|
40 |
PDFFileName = f.name
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
|
|
|
|
75 |
"name": names,
|
76 |
"contacts": contacts,
|
77 |
"age": ages,
|
@@ -80,9 +131,15 @@ if uploaded_file:
|
|
80 |
"school": school,
|
81 |
"work": works,
|
82 |
"skill": skills
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
from tempfile import NamedTemporaryFile
|
20 |
import pypdfium2 as pdfium
|
21 |
|
22 |
+
# Sample CV image files: each is loaded and shown as a thumbnail next to
# its "Example N" button.
examples = [
    "CV.png",
    "cv_patrik.jpg",
    "cv_melanie.jpg",
]
# PDF file name selected when the matching example button is pressed;
# index-aligned with `examples`.
examples_pdf = [
    "CV.pdf",
    "CV_Patrik.pdf",
    "CV_Melanie.pdf",
]
|
24 |
|
|
|
|
|
25 |
|
26 |
+
def load_image(image_file):
    """Open an image via ``Image.open`` and return the resulting object.

    Args:
        image_file: Value forwarded unchanged to ``Image.open``; callers in
            this file pass file-name strings such as ``'tulaco.png'``.

    Returns:
        Whatever ``Image.open(image_file)`` returns.
    """
    # NOTE(review): `Image` is presumably PIL's Image module; the import is
    # not visible in this view -- confirm.
    return Image.open(image_file)
|
29 |
+
|
30 |
+
def main():
|
31 |
+
head1, head2 = st.columns(2)
|
32 |
+
with head1:
|
33 |
+
tula_logo=load_image('tulaco.png')
|
34 |
+
st.image(tula_logo,width=200)
|
35 |
+
with head2:
|
36 |
+
st.write('mail@tula.co')
|
37 |
+
st.write('www.tula.co')
|
38 |
+
st.title("CV parsing with Chat GPT")
|
39 |
+
PDFFileName = ''
|
40 |
+
|
41 |
+
if not "initialized" in st.session_state:
|
42 |
+
st.session_state.isbutton = False
|
43 |
+
st.session_state.initialized = True
|
44 |
+
|
45 |
+
uploaded_file = st.file_uploader("Upload CV in PDF or image format", type=["pdf","png","jpg","jpeg"])
|
46 |
+
|
47 |
+
nltk.download('punkt')
|
48 |
+
nltk.download('averaged_perceptron_tagger')
|
49 |
+
|
50 |
+
st.subheader("CV examples")
|
51 |
+
col1, col2, col3 = st.columns(3)
|
52 |
+
with col1:
|
53 |
+
ex=load_image(examples[0])
|
54 |
+
st.image(ex,width=100)
|
55 |
+
if st.button('Example 1'):
|
56 |
+
ex=load_image(examples[0])
|
57 |
+
img = ex.convert('RGB')
|
58 |
+
loader = UnstructuredPDFLoader(img)
|
59 |
+
img.save('CV.pdf')
|
60 |
+
st.session_state.isbutton=True
|
61 |
+
PDFFileName=examples_pdf[0]
|
62 |
+
|
63 |
+
with col2:
|
64 |
+
ex1=load_image(examples[1])
|
65 |
+
st.image(ex1,width=100)
|
66 |
+
if st.button('Example 2'):
|
67 |
+
st.session_state.isbutton=True
|
68 |
+
PDFFileName = examples_pdf[1]
|
69 |
+
|
70 |
+
with col3:
|
71 |
+
ex2=load_image(examples[2])
|
72 |
+
st.image(ex2,width=100)
|
73 |
+
if st.button('Example 3'):
|
74 |
+
st.session_state.isbutton=True
|
75 |
+
PDFFileName = examples_pdf[2]
|
76 |
+
|
77 |
+
if (uploaded_file is not None) and (st.session_state.isbutton==False):
|
78 |
file_name, file_extension = os.path.splitext(uploaded_file.name)
|
79 |
+
|
80 |
if file_extension != '.pdf':
|
81 |
uploaded_image = Image.open(uploaded_file)
|
|
|
82 |
img = uploaded_image.convert('RGB')
|
83 |
loader = UnstructuredPDFLoader(img)
|
84 |
img.save(file_name+'.pdf')
|
|
|
87 |
with NamedTemporaryFile(delete=False, dir='.', suffix='.pdf') as f:
|
88 |
f.write(uploaded_file.getbuffer())
|
89 |
PDFFileName = f.name
|
90 |
+
|
91 |
+
if PDFFileName != '':
|
92 |
+
pdf = pdfium.PdfDocument(PDFFileName)
|
93 |
+
n_pages = len(pdf)
|
94 |
+
for page_number in range(n_pages):
|
95 |
+
page = pdf.get_page(page_number)
|
96 |
+
pil_image = page.render(scale=4).to_pil()
|
97 |
+
st.image(pil_image,width=700)
|
98 |
+
|
99 |
+
with st.spinner('Document parsing in progress ...'):
|
100 |
+
loader = UnstructuredPDFLoader(PDFFileName)
|
101 |
+
pages = loader.load_and_split()
|
102 |
+
embeddings = OpenAIEmbeddings()
|
103 |
+
docsearch = Chroma.from_documents(pages, embeddings).as_retriever()
|
104 |
+
|
105 |
+
current_date = datetime.now()
|
106 |
+
query = "Output informatio, (all in English), from the document in JSON format: full name, contacts, age, languages, education, school, work experience, skills. If some fields cannot be filled from the document, then create this field and fill it with N/A. If the date of birth is not indicated, then please calculate the approximate age of the candidate based on the information provided in the document, for calculations, take into account that graduation from the university is usually at 22 years old. Current date = "+ current_date.date().strftime('%Y-%m-%d')
|
107 |
+
docs = docsearch.get_relevant_documents(query)
|
108 |
+
chain = load_qa_chain(ChatOpenAI(temperature=0), chain_type="stuff")
|
109 |
+
output = chain.run(input_documents=docs, question=query)
|
110 |
+
st.subheader("Parsing result in JSON format")
|
111 |
+
valid_json = ast.literal_eval(output)
|
112 |
+
st.json(valid_json)
|
113 |
+
|
114 |
+
json_data = json.loads(json.dumps(valid_json))
|
115 |
+
|
116 |
+
names = [json_data.get("full_name", "N/A")]
|
117 |
+
contacts = [json_data.get("contacts", "N/A")]
|
118 |
+
ages = [json_data.get("age", "N/A")]
|
119 |
+
languages = [json_data.get("languages", "N/A")]
|
120 |
+
education = [json_data.get("education", "N/A")]
|
121 |
+
school = [json_data.get("school", "N/A")]
|
122 |
+
works = [json_data.get("work_experience", "N/A")]
|
123 |
+
skills = [json_data.get("skills", "N/A")]
|
124 |
+
|
125 |
+
df = pd.DataFrame({
|
126 |
"name": names,
|
127 |
"contacts": contacts,
|
128 |
"age": ages,
|
|
|
131 |
"school": school,
|
132 |
"work": works,
|
133 |
"skill": skills
|
134 |
+
})
|
135 |
+
st.subheader("Parsing result as a table")
|
136 |
+
st.table(df)
|
137 |
+
csv = df.to_csv(index=False).encode('utf-8')
|
138 |
+
download1 = st.download_button(label="Download result as CSV",data=csv,file_name='result_df.csv',mime='text/csv')
|
139 |
+
PDFFileName = ''
|
140 |
+
uploaded_file = None
|
141 |
+
st.success("Ready!")
|
142 |
+
|
143 |
+
|
144 |
+
if __name__ == "__main__":
|
145 |
+
main()
|
cv_melanie.jpg
ADDED
cv_patrik.jpg
ADDED
tulaco.png
ADDED