asimokby committed on
Commit cfbfc47
1 Parent(s): 5e712eb

create cv parser app

Files changed (6)
  1. Main.py +23 -0
  2. Models.py +58 -0
  3. ResumeParser.py +237 -0
  4. ResumeReader.py +100 -0
  5. ResumeSegmenter.py +259 -0
  6. app.py +18 -0
Main.py ADDED
@@ -0,0 +1,23 @@
from ResumeReader import ResumeReader
from ResumeParser import ResumeParser
from Models import Models
import json


class Main:
    def __init__(self):
        models = Models()
        ner, ner_dates, zero_shot_classifier, tagger = models.load_trained_models()
        self.reader = ResumeReader()
        self.parser = ResumeParser(ner, ner_dates, zero_shot_classifier, tagger)

    def parse_cv(self, file_path):
        resume_lines = self.reader.read_file(file_path)
        return self.parser.parse(resume_lines)

    def save_parse_as_json(self, parse_dict, file_name):
        # Renamed from `dict` to avoid shadowing the built-in.
        print("Saving the parse...")
        with open(file_name, 'w', encoding="utf-8") as f:
            json.dump(parse_dict, f, indent=4, default=str, ensure_ascii=False)
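For context, a minimal usage sketch of this class; the file names here are hypothetical placeholders, not part of this commit:

if __name__ == "__main__":
    main = Main()
    parsed = main.parse_cv("my_cv.pdf")  # hypothetical input resume
    main.save_parse_as_json(parsed, "my_cv.json")  # hypothetical output path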
Models.py ADDED
@@ -0,0 +1,58 @@
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
from flair.data import Sentence
from flair.models import SequenceTagger
import pickle


class Models:

    def pickle_it(self, obj, file_name):
        with open(f'{file_name}.pickle', 'wb') as f:
            pickle.dump(obj, f)

    def unpickle_it(self, file_name):
        with open(f'{file_name}.pickle', 'rb') as f:
            return pickle.load(f)

    def load_trained_models(self, save_pickle=False):
        # NER (dates)
        tokenizer = AutoTokenizer.from_pretrained("Jean-Baptiste/camembert-ner-with-dates")
        model = AutoModelForTokenClassification.from_pretrained("Jean-Baptiste/camembert-ner-with-dates")
        self.ner_dates = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")

        # Zero-shot classification
        self.zero_shot_classifier = pipeline("zero-shot-classification", model='facebook/bart-large-mnli')
        # self.zero_shot_classifier = pipeline("zero-shot-classification", model='valhalla/distilbart-mnli-12-6')

        # NER (names, organizations, ...)
        tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
        model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
        # aggregation_strategy="simple" replaces the deprecated grouped_entities=True.
        self.ner = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")

        # POS tagging
        self.tagger = SequenceTagger.load("flair/pos-english-fast")

        if save_pickle:  # renamed from `pickle` to avoid shadowing the module
            self.pickle_models()

        return self.ner, self.ner_dates, self.zero_shot_classifier, self.tagger

    def pickle_models(self):
        self.pickle_it(self.ner, "ner")
        self.pickle_it(self.zero_shot_classifier, "zero_shot_classifier_6")
        self.pickle_it(self.ner_dates, "ner_dates")
        self.pickle_it(self.tagger, "pos_tagger_fast")

    def load_pickled_models(self):
        # Note: the return order here (ner_dates first) differs from load_trained_models.
        ner_dates = self.unpickle_it('ner_dates')
        ner = self.unpickle_it('ner')
        zero_shot_classifier = self.unpickle_it('zero_shot_classifier_6')
        tagger = self.unpickle_it("pos_tagger_fast")
        return ner_dates, ner, zero_shot_classifier, tagger

    def get_flair_sentence(self, sent):
        return Sentence(sent)
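As Main.py in this commit shows, the models are loaded once and shared by the parser; a short sketch of the intended call pattern (the pickle round-trip is optional):

models = Models()
ner, ner_dates, zero_shot_classifier, tagger = models.load_trained_models()
# Optionally cache the heavy pipelines to disk for faster restarts.
# Mind the different return order of load_pickled_models.
models.pickle_models()
ner_dates, ner, zero_shot_classifier, tagger = models.load_pickled_models()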
ResumeParser.py ADDED
@@ -0,0 +1,237 @@
from Models import Models
from ResumeSegmenter import ResumeSegmenter
from datetime import datetime
from dateutil import parser
import re
from string import punctuation


class ResumeParser:
    def __init__(self, ner, ner_dates, zero_shot_classifier, tagger):
        self.models = Models()
        self.segmenter = ResumeSegmenter(zero_shot_classifier)
        self.ner, self.ner_dates, self.zero_shot_classifier, self.tagger = ner, ner_dates, zero_shot_classifier, tagger
        self.parsed_cv = {}

    def parse(self, resume_lines):
        resume_segments = self.segmenter.segment(resume_lines)
        print("Parsing the Resume...")
        for segment_name in resume_segments:
            if segment_name == "contact_info":
                self.parse_contact_info(resume_segments[segment_name])
            elif segment_name == "work_and_employment":
                self.parse_job_history(resume_segments[segment_name])
        return self.parsed_cv

    def parse_contact_info(self, contact_info):
        contact_info_dict = {}
        name = self.find_person_name(contact_info)
        email = self.find_contact_email(contact_info)
        self.parsed_cv['Name'] = name
        contact_info_dict["Email"] = email
        self.parsed_cv['Contact Info'] = contact_info_dict

    def find_person_name(self, items):
        class_score = []
        # Split on punctuation, but keep "&" so names like "Smith & Co" survive.
        splitter = re.compile(r'[{}]+'.format(re.escape(punctuation.replace("&", ""))))
        classes = ["person name", "address", "email", "title"]
        for item in items:
            elements = splitter.split(item)
            for element in elements:
                element = ''.join(i for i in element.strip() if not i.isdigit())
                if not len(element.strip().split()) > 1:
                    continue
                out = self.zero_shot_classifier(element, classes)
                highest = sorted(zip(out["labels"], out["scores"]), key=lambda x: x[1])[-1]
                if highest[0] == "person name":
                    class_score.append((element, highest[1]))
        if class_score:
            return sorted(class_score, key=lambda x: x[1], reverse=True)[0][0]
        return ""

    def find_contact_email(self, items):
        for item in items:
            match = re.search(r'[\w.+-]+@[\w-]+\.[\w.-]+', item)
            if match:
                return match.group(0)
        return ""

    def parse_job_history(self, resume_segment):
        idx_job_title = self.get_job_titles(resume_segment)
        current_and_below = False
        if not idx_job_title:
            self.parsed_cv["Job History"] = []
            return
        if idx_job_title[0][0] == 0:
            current_and_below = True
        job_history = []
        for ls_idx, (idx, job_title) in enumerate(idx_job_title):
            job_info = {}
            job_info["Job Title"] = self.filter_job_title(job_title)
            # Company: look on the title line and the line just below/above it.
            if current_and_below:
                line1, line2 = idx, idx + 1
            else:
                line1, line2 = idx, idx - 1
            job_info["Company"] = self.get_job_company(line1, line2, resume_segment)
            st_span = idx if current_and_below else idx - 1
            # Dates: search from this title up to the next one.
            if ls_idx == len(idx_job_title) - 1:
                end_span = len(resume_segment)
            else:
                end_span = idx_job_title[ls_idx + 1][0]
            start, end = self.get_job_dates(st_span, end_span, resume_segment)
            job_info["Start Date"] = start
            job_info["End Date"] = end
            job_history.append(job_info)
        self.parsed_cv["Job History"] = job_history

    def get_job_titles(self, resume_segment):
        classes = ["organization", "institution", "job title", "role"]
        idx_line = []
        for idx, line in enumerate(resume_segment):
            # Lines containing a verb are treated as descriptions, not titles.
            has_verb = False
            sentence = self.models.get_flair_sentence(line)
            self.tagger.predict(sentence)
            for entity in sentence.get_spans('pos'):
                if entity.tag.startswith("V"):
                    has_verb = True
                    break
            if not has_verb:
                out = self.zero_shot_classifier(line, classes)
                class_score = zip(out["labels"], out["scores"])
                highest = sorted(class_score, key=lambda x: x[1])[-1]
                if highest[0] == "job title":
                    idx_line.append((idx, line))
        return idx_line

    def get_job_dates(self, st, end, resume_segment):
        search_span = resume_segment[st:end]
        dates = []
        for line in search_span:
            for dt in self.get_ner_in_line(line, "DATE"):
                if self.isvalidyear(dt.strip()):
                    dates.append(dt)
        if not dates:
            return "", ""
        first = dates[0]
        second = dates[1] if len(dates) > 1 else None
        if self.has_two_dates(first):
            d1, d2 = self.get_two_dates(first)
            return self.format_date(d1), self.format_date(d2)
        if second is not None and self.has_two_dates(second):
            d1, d2 = self.get_two_dates(second)
            return self.format_date(d1), self.format_date(d2)
        if second is not None:
            return self.format_date(first), self.format_date(second)
        return self.format_date(first), ""

    def filter_job_title(self, job_title):
        job_title_splitter = re.compile(r'[{}]+'.format(re.escape(punctuation.replace("&", ""))))
        job_title = ''.join(i for i in job_title if not i.isdigit())
        tokens = job_title_splitter.split(job_title)
        tokens = [''.join([i for i in tok.strip() if (i.isalpha() or i.strip() == "")]) for tok in tokens if tok.strip()]
        classes = ["company", "organization", "institution", "job title", "responsibility", "details"]
        new_title = []
        for token in tokens:
            if not token:
                continue
            res = self.zero_shot_classifier(token, classes)
            class_score = zip(res["labels"], res["scores"])
            highest = sorted(class_score, key=lambda x: x[1])[-1]
            if highest[0] == "job title":
                new_title.append(token.strip())
        if new_title:
            return ', '.join(new_title)
        return ', '.join(tokens)

    def has_two_dates(self, date):
        years = self.get_valid_years()
        count = 0
        for year in years:
            if year in str(date):
                count += 1
        return count == 2

    def get_two_dates(self, date):
        years = self.get_valid_years()
        idxs = [date.index(year) for year in years if year in date]
        min_idx = min(idxs)
        first = date[:min_idx + 4]
        second = date[min_idx + 4:]
        return first, second

    def get_valid_years(self):
        # Include the current year so ongoing jobs validate.
        current_year = datetime.today().year
        return [str(i) for i in range(current_year - 100, current_year + 1)]

    def format_date(self, date):
        out = self.parse_date(date)
        if out:
            return out
        date = self.clean_date(date)
        out = self.parse_date(date)
        return out if out else date

    def clean_date(self, date):
        try:
            return ''.join(i for i in date if i.isalnum() or i == '-' or i == '/')
        except Exception:
            return date

    def parse_date(self, date):
        # The original fallback datetime(date) always raised on a string; dropped.
        try:
            return parser.parse(date).strftime("%m-%Y")
        except Exception:
            return ""

    def isvalidyear(self, date):
        for year in self.get_valid_years():
            if year in str(date):
                return True
        return False

    def get_ner_in_line(self, line, entity_type):
        ner = self.ner_dates if entity_type == "DATE" else self.ner
        return [i['word'] for i in ner(line) if i['entity_group'] == entity_type]

    def get_job_company(self, idx, idx1, resume_segment):
        job_title = resume_segment[idx]
        context = resume_segment[idx1] if idx1 <= len(resume_segment) - 1 else ""
        candidate_companies = self.get_ner_in_line(job_title, "ORG") + self.get_ner_in_line(context, "ORG")
        classes = ["organization", "company", "institution", "not organization", "not company", "not institution"]
        scores = []
        for comp in candidate_companies:
            # The pipeline returns scores sorted in descending order.
            res = self.zero_shot_classifier(comp, classes)['scores']
            scores.append(max(res[:3]))
        sorted_cmps = sorted(zip(candidate_companies, scores), key=lambda x: x[1], reverse=True)
        if sorted_cmps:
            return sorted_cmps[0][0]
        return context
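Given the keys assigned in parse_contact_info and parse_job_history above, the parser's output takes this shape (the values below are hypothetical):

# {
#     "Name": "Jane Doe",
#     "Contact Info": {"Email": "jane.doe@example.com"},
#     "Job History": [
#         {"Job Title": "Software Engineer",
#          "Company": "Acme Corp",
#          "Start Date": "01-2018",
#          "End Date": "06-2021"}
#     ]
# }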
ResumeReader.py ADDED
@@ -0,0 +1,100 @@
import re
import logging
import docx
import pdfplumber


class ResumeReader:

    def convert_docx_to_txt(self, docx_file, docx_parser):
        """
        A utility function to convert a Microsoft docx file to raw text.

        This code is largely borrowed from existing solutions, and does not match the style of the rest of this repo.
        :param docx_file: docx file which gets uploaded by the user
        :type docx_file: InMemoryUploadedFile
        :return: The cleaned lines and the raw text of the docx file
        :rtype: (list, str)
        """
        doc = docx.Document(docx_file)
        # Join paragraphs with newlines so each paragraph becomes its own line.
        text = '\n'.join(docpara.text for docpara in doc.paragraphs)
        try:
            clean_text = re.sub(r'\n+', '\n', text)
            clean_text = clean_text.replace("\r", "\n").replace("\t", " ")  # Normalize text blob
            resume_lines = clean_text.splitlines()  # Split text blob into individual lines
            resume_lines = [re.sub(r'\s+', ' ', line.strip()) for line in resume_lines if
                            line.strip()]  # Remove empty strings and whitespace
            return resume_lines, text
        except Exception as e:
            logging.error('Error in docx file: ' + str(e))
            return [], " "

    def convert_pdf_to_txt(self, pdf_file):
        """
        A utility function to convert a machine-readable PDF to raw text.

        This code is largely borrowed from existing solutions, and does not match the style of the rest of this repo.
        :param pdf_file: Path to the .pdf file which should be converted
        :type pdf_file: str
        :return: The cleaned lines and the raw text of the pdf
        :rtype: (list, str)
        """
        pdf = pdfplumber.open(pdf_file)
        raw_text = ""
        for page in pdf.pages:
            # extract_text() returns None for pages without extractable text.
            raw_text += (page.extract_text() or "") + "\n"
        pdf.close()

        try:
            full_string = re.sub(r'\n+', '\n', raw_text)
            full_string = full_string.replace("\r", "\n")
            full_string = full_string.replace("\t", " ")

            # Remove awkward LaTeX bullet characters
            full_string = re.sub(r"\uf0b7", " ", full_string)
            full_string = re.sub(r"\(cid:\d{0,3}\)", " ", full_string)
            full_string = re.sub(r'• ', " ", full_string)

            # Split text blob into individual lines
            resume_lines = full_string.splitlines(True)

            # Remove empty strings and whitespace
            resume_lines = [re.sub(r'\s+', ' ', line.strip()) for line in resume_lines if line.strip()]

            return resume_lines, raw_text
        except Exception as e:
            logging.error('Error in pdf file: ' + str(e))
            return [], " "

    def read_file(self, file, docx_parser="tika"):
        """
        file : path of the resume file
        docx_parser : "docx2txt" or "tika"; defaults to "tika"
        """
        print("Reading the Resume...")
        if file.endswith('docx') or file.endswith('doc'):
            # Note: python-docx only supports .docx; legacy .doc files will fail here.
            resume_lines, raw_text = self.convert_docx_to_txt(file, docx_parser)
        elif file.endswith('pdf'):
            resume_lines, raw_text = self.convert_pdf_to_txt(file)
        elif file.endswith('txt'):
            with open(file, 'r', encoding='utf-8') as f:
                # Strip newlines and drop blank lines, matching the other readers.
                resume_lines = [line.strip() for line in f if line.strip()]
        else:
            resume_lines = None
        return resume_lines
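A minimal usage sketch ("resume.pdf" is a hypothetical path):

reader = ResumeReader()
resume_lines = reader.read_file("resume.pdf")
# -> list of cleaned, non-empty lines, e.g. ["Jane Doe", "Work Experience", ...]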
ResumeSegmenter.py ADDED
@@ -0,0 +1,259 @@
class ResumeSegmenter:

    def __init__(self, zero_shot_classifier):
        self.zero_shot_classifier = zero_shot_classifier

    objective = (
        'career goal',
        'objective',
        'career objective',
        'employment objective',
        'professional objective',
        'summary',
        'summary of qualifications',
        'digital'
    )

    work_and_employment = (
        'employment history',
        'employment data',
        'career summary',
        'work history',
        'work experience',
        'experience',
        'professional experience',
        'professional background',
        'professional employment',
        'additional experience',
        'career related experience',
        'professional employment history',
        'related experience',
        'programming experience',
        'freelance',
        'freelance experience',
        'army experience',
        'military experience',
        'military background',
    )

    education_and_training = (
        'academic background',
        'academic experience',
        'programs',
        'courses',
        'related courses',
        'education',
        'educational background',
        'educational qualifications',
        'educational training',
        'education and training',
        'training',
        'academic training',
        'professional training',
        'course project experience',
        'related course projects',
        'internship experience',
        'internships',
        'apprenticeships',
        'college activities',
        'certifications',
        'special training',
    )

    skills_header = (
        'credentials',
        'qualifications',
        'areas of experience',
        'areas of expertise',
        'areas of knowledge',
        'skills',
        'other skills',
        'other abilities',
        'career related skills',
        'professional skills',
        'specialized skills',
        'technical skills',
        'computer skills',
        'personal skills',
        'computer knowledge',
        'technologies',
        'technical experience',
        'proficiencies',
        'languages',
        'language competencies and skills',
        'programming languages',
        'competencies'
    )

    misc = (
        'activities and honors',
        'activities',
        'affiliations',
        'professional affiliations',
        'associations',
        'professional associations',
        'memberships',
        'professional memberships',
        'athletic involvement',
        'community involvement',
        'refere',  # prefix match for "references", "referees", ...
        'civic activities',
        'extra-curricular activities',  # lowercased so it can match lowercased headers
        'professional activities',
        'volunteer work',
        'volunteer experience',
        'additional information',
        'interests'
    )

    accomplishments = (
        'achievement',
        'awards and achievements',
        'licenses',
        'presentations',
        'conference presentations',
        'conventions',
        'dissertations',
        'exhibits',
        'papers',
        'publications',
        'professional publications',
        'research experience',
        'research grants',
        'project',
        'research projects',
        'personal projects',
        'current research interests',
        'thesis',
        'theses',
    )

    def find_segment_indices(self, string_to_search, resume_segments, resume_indices):
        # One keyword tuple per section; checked in order, like the original elif chain.
        section_keywords = {
            'objective': self.objective,
            'work_and_employment': self.work_and_employment,
            'education_and_training': self.education_and_training,
            'skills': self.skills_header,
            'misc': self.misc,
            'accomplishments': self.accomplishments,
        }
        for i, line in enumerate(string_to_search):
            # Headers are assumed to start with an uppercase character.
            if line[0].islower():
                continue
            header = line.lower()
            for section, keywords in section_keywords.items():
                matches = [k for k in keywords if header.startswith(k)]
                if matches:
                    if header not in resume_segments[section]:
                        resume_indices.append(i)
                        resume_segments[section][matches[0]] = i
                    break

    def slice_segments(self, string_to_search, resume_segments, resume_indices):
        resume_segments['contact_info'] = string_to_search[:resume_indices[0]]
        sec_idxs = {}
        for section, value in resume_segments.items():
            if section == 'contact_info':
                continue
            for sub_section, start_idx in value.items():
                # Each sub-section runs from its header to the next detected header.
                end_idx = len(string_to_search)
                if (resume_indices.index(start_idx) + 1) != len(resume_indices):
                    end_idx = resume_indices[resume_indices.index(start_idx) + 1]
                sec_idxs[section] = (start_idx, end_idx)
                resume_segments[section][sub_section] = string_to_search[start_idx:end_idx]
        return sec_idxs

    def find_true_segment(self, dict_of_segments, segment_name):
        segment_classes = {
            'objective': ["objective", "other"],
            'work_and_employment': ["employment history", "other"],
            'education_and_training': ["education", "other"],
            'skills': ["skills", "other"],
            'accomplishments': ["accomplishments", "other"],
            'misc': ["misc", "other"],
            'contact_info': ["contact information", "other"]
        }
        classes = segment_classes[segment_name]
        scores = []
        for seg, sequence in dict_of_segments.items():
            # Score each candidate segment by the confidence of its top label.
            score = self.zero_shot_classifier(' '.join(sequence), classes)["scores"][0]
            scores.append(score)
        res = sorted(zip(dict_of_segments.keys(), scores), key=lambda x: x[1], reverse=True)
        if res:
            return res[0][0]
        return 0

    def segment(self, string_to_search):
        print("Segmenting the Resume...")
        resume_segments = {
            'objective': {},
            'work_and_employment': {},
            'education_and_training': {},
            'skills': {},
            'accomplishments': {},
            'misc': {}
        }

        resume_indices = []

        self.find_segment_indices(string_to_search, resume_segments, resume_indices)
        if resume_indices:
            self.slice_segments(string_to_search, resume_segments, resume_indices)
        else:
            resume_segments['contact_info'] = []

        for segment in resume_segments:
            if segment == "contact_info":
                continue
            if not len(resume_segments[segment]) > 1:
                # Zero or one candidate header: keep the single block minus its header line.
                if len(resume_segments[segment]) == 1:
                    only_key = list(resume_segments[segment].keys())[0]
                    resume_segments[segment] = resume_segments[segment][only_key][1:]
                continue
            if segment != "work_and_employment":
                continue
            # Several candidate headers: let the zero-shot classifier pick the real one.
            true_seg = self.find_true_segment(resume_segments[segment], segment)
            if not true_seg:
                resume_segments[segment] = []
            else:
                resume_segments[segment] = resume_segments[segment][true_seg][1:]

        return resume_segments
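A small, deterministic walk-through of segment() on toy input (the zero-shot classifier is only consulted when a section has several candidate headers, so it is not needed here):

lines = ["John Smith", "john@example.com",
         "Work Experience", "Software Engineer", "Acme Corp 2018 - 2021"]
segments = ResumeSegmenter(zero_shot_classifier).segment(lines)
# segments["contact_info"]        -> ["John Smith", "john@example.com"]
# segments["work_and_employment"] -> ["Software Engineer", "Acme Corp 2018 - 2021"]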
app.py ADDED
@@ -0,0 +1,18 @@
import gradio as gr
from Main import Main


main = Main()


def parse_cv(cv):
    return main.parse_cv(cv.name)


description = """A demo for a CV parser built with HuggingFace's transformers."""
article = "Find the code on GitHub <a href='https://github.com/asimokby/cv-parser-huggingface'>here</a>."
file_input = gr.inputs.File(file_count="single", type="file", label="Upload a CV", optional=False)
iface = gr.Interface(fn=parse_cv, inputs=file_input, outputs="json", allow_flagging="never",
                     allow_screenshot=False, title="CV Parser", theme="dark", description=description, article=article)

iface.launch()
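Running the script (e.g. with "python app.py") loads the models via Main and serves the Gradio interface locally; the uploaded file object is handed to Main.parse_cv through its temporary path (cv.name).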