Spaces:
Sleeping
Sleeping
DilshanKavinda
committed on
Commit
•
5886c55
1
Parent(s):
00033f7
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,204 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pdfplumber
|
3 |
+
import pandas as pd
|
4 |
+
|
5 |
+
from sentence_transformers import SentenceTransformer, util
|
6 |
+
# Sentence-embedding model used to score CV text against the job description.
# Created at module level so all helper functions and main() share one instance.
model = SentenceTransformer('all-MiniLM-L6-v2')
7 |
+
|
8 |
+
def extract_information_from_cv(pdf_content):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_content: A path or file-like object accepted by ``pdfplumber.open``
            (e.g. a Streamlit ``UploadedFile``).

    Returns:
        str: All extractable page text joined together; pages with no
        extractable text contribute nothing.
    """
    with pdfplumber.open(pdf_content) as pdf:
        # extract_text() returns None for pages without extractable text
        # (e.g. scanned images); coalesce to "" so concatenation cannot
        # raise a TypeError.
        return "".join(page.extract_text() or "" for page in pdf.pages)
15 |
+
|
16 |
+
def _extract_section(text, start_marker, end_marker=None):
    """Return the stripped text between two section markers.

    Args:
        text: Full CV text to search.
        start_marker: Label that opens the section (e.g. ``"Title:"``).
        end_marker: Label that closes the section, or None to read to the
            end of the text.

    Returns:
        str | None: The section content with surrounding whitespace removed,
        or None when a required marker is absent. (As in the original code,
        if the end marker occurs before the start marker the slice is empty
        and ``""`` is returned.)
    """
    start = text.find(start_marker)
    if start == -1:
        return None
    section_start = start + len(start_marker)
    if end_marker is None:
        return text[section_start:].strip()
    end = text.find(end_marker)
    if end == -1:
        return None
    return text[section_start:end].strip()


def extract_title(text):
    """Text between "Title:" and "Name:", or None if a marker is missing."""
    return _extract_section(text, "Title:", "Name:")


def extract_name(text):
    """Text between "Name:" and "Email:", or None if a marker is missing."""
    return _extract_section(text, "Name:", "Email:")


def extract_Email(text):
    """Text between "Email:" and "Phone:", or None if a marker is missing."""
    return _extract_section(text, "Email:", "Phone:")


def extract_Phone(text):
    """Text between "Phone:" and "LinkedIn:", or None if a marker is missing."""
    return _extract_section(text, "Phone:", "LinkedIn:")


def extract_LinkedIn(text):
    """Text between "LinkedIn:" and "GitHub:", or None if a marker is missing."""
    return _extract_section(text, "LinkedIn:", "GitHub:")


def extract_Github(text):
    """Text between "GitHub:" and "Summary:", or None if a marker is missing."""
    return _extract_section(text, "GitHub:", "Summary:")


def extract_summary(text):
    """Text between "Summary:" and "Education:", or None if a marker is missing."""
    return _extract_section(text, "Summary:", "Education:")


def extract_education(text):
    """Text between "Education:" and "Internship:", or None if a marker is missing."""
    return _extract_section(text, "Education:", "Internship:")


def extract_Internship(text):
    """Text between "Internship:" and "Professional Experience:", or None."""
    return _extract_section(text, "Internship:", "Professional Experience:")


def extract_experience(text):
    """Text between "Professional Experience:" and "Projects:", or None."""
    return _extract_section(text, "Professional Experience:", "Projects:")


def extract_projects(text):
    """Text between "Projects:" and "Awards and Certifications:", or None."""
    return _extract_section(text, "Projects:", "Awards and Certifications:")


def extract_certifications(text):
    """Text between "Awards and Certifications:" and "Skills:", or None."""
    return _extract_section(text, "Awards and Certifications:", "Skills:")


def extract_skills(text):
    """Text after "Skills:" to the end of the CV, or None if missing."""
    return _extract_section(text, "Skills:")
109 |
+
|
110 |
+
|
111 |
+
def main():
    """Streamlit entry point: rank uploaded CV PDFs against a job description.

    Extracts labelled sections from each uploaded CV, embeds the full CV
    text and the job description with the shared sentence-transformer
    model, and displays candidates sorted by cosine similarity, plus the
    raw CV text of the top-N candidates.
    """
    st.title("CV Shortlisting App")
    job_description = st.text_area('Job description')
    uploaded_files = st.file_uploader("Choose multiple CV files", type="pdf", accept_multiple_files=True)

    # One 1-based option per uploaded CV; guard the empty-upload case so the
    # selectbox is not built from a None/empty upload list.
    options = [i + 1 for i in range(len(uploaded_files))] if uploaded_files else []
    no_of_candidates = st.selectbox('No of candidates need:', options) if options else None

    # Always bind the button flag so the combined condition below can never
    # hit an unbound name.
    extract_button = st.button("Extract Data") if no_of_candidates else False

    extracted_data = []
    cv_data = []

    if uploaded_files and extract_button:
        for uploaded_file in uploaded_files:
            cv_text = extract_information_from_cv(uploaded_file)
            cv_data.append(cv_text)

            # Store plain scalar values (not one-element lists) so the
            # DataFrame below gets clean string columns and the email
            # matching further down can compare strings directly.
            extracted_data.append({
                "Title": extract_title(cv_text),
                "Name": extract_name(cv_text),
                "Email": extract_Email(cv_text),
                "Phone": extract_Phone(cv_text),
                "LinkedIn": extract_LinkedIn(cv_text),
                "Github": extract_Github(cv_text),
                "Summary": extract_summary(cv_text),
                "Education": extract_education(cv_text),
                "Internships": extract_Internship(cv_text),
                "Professional Experience": extract_experience(cv_text),
                "Projects": extract_projects(cv_text),
                "Awards and Certifications": extract_certifications(cv_text),
                "Skills": extract_skills(cv_text),
            })

        # Compute embeddings for the job description and every CV, then
        # score each CV by cosine similarity to the description.
        job_embedding = model.encode(job_description, convert_to_tensor=True)
        cv_embeddings = model.encode(cv_data, convert_to_tensor=True)
        cosine_scores = util.cos_sim(job_embedding, cv_embeddings)

        # .item() turns each 0-d tensor into a plain float so the Score
        # column sorts numerically instead of holding tensor objects.
        scores = [cosine_scores[0][i].item() for i in range(len(cv_data))]

        st.write("### Extracted Data:")
        final_df = pd.DataFrame(extracted_data)
        final_df['Score'] = scores
        df_sorted = final_df.sort_values(by='Score', ascending=False)

        # Recover the full CV text of the top-N candidates by matching the
        # extracted email back into the raw CV text.
        top_cvs = df_sorted.head(no_of_candidates)
        top_cv_list = []
        for email in top_cvs['Email'].values:
            for cv in cv_data:
                if email and email in cv:
                    top_cv_list.append(cv)

        st.write(df_sorted)

        st.subheader(f"Top {no_of_candidates} Candidates's cv")
        st.write(top_cv_list)


if __name__ == "__main__":
    main()