Keshav4 Sybghat committed on
Commit
643a815
0 Parent(s):

Duplicate from Sybghat/resume-parser

Browse files

Co-authored-by: Sybghat Ullah <Sybghat@users.noreply.huggingface.co>

Files changed (10) hide show
  1. .gitattributes +34 -0
  2. Models.py +58 -0
  3. README.md +14 -0
  4. ResumeParser.py +258 -0
  5. ResumeReader.py +103 -0
  6. ResumeSegmenter.py +264 -0
  7. app.py +18 -0
  8. main.py +23 -0
  9. readMe.txt +3 -0
  10. requirements.txt +111 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
Models.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoModelForSequenceClassification
2
+ from transformers import pipeline
3
+ from flair.data import Sentence
4
+ from flair.models import SequenceTagger
5
+ import pickle
6
+
7
+
8
+
9
class Models:
    """Load the NLP models used by the resume parser and cache them as pickles."""

    def pickle_it(self, obj, file_name):
        """Write *obj* to ``<file_name>.pickle`` using the pickle format."""
        target = f'{file_name}.pickle'
        with open(target, 'wb') as handle:
            pickle.dump(obj, handle)

    def unpickle_it(self, file_name):
        """Return the object stored in ``<file_name>.pickle``.

        NOTE: unpickling can execute code embedded in the file, so only
        load pickles that were produced by ``pickle_it``.
        """
        source = f'{file_name}.pickle'
        with open(source, 'rb') as handle:
            restored = pickle.load(handle)
        return restored

    def load_trained_models(self, pickle=False):
        """Build the four models, store them on the instance and return them.

        :param pickle: when true, also write every model to disk through
            ``pickle_models``.
        :return: ``(ner, ner_dates, zero_shot_classifier, tagger)``
        """
        # NER model that also recognises dates.
        dates_checkpoint = "Jean-Baptiste/camembert-ner-with-dates"
        dates_tokenizer = AutoTokenizer.from_pretrained(dates_checkpoint)
        dates_model = AutoModelForTokenClassification.from_pretrained(dates_checkpoint)
        self.ner_dates = pipeline('ner', model=dates_model, tokenizer=dates_tokenizer, aggregation_strategy="simple")

        # Zero-shot classifier (distilled BART fine-tuned on MNLI).
        self.zero_shot_classifier = pipeline("zero-shot-classification", model='valhalla/distilbart-mnli-12-6')

        # General-purpose NER.
        ner_checkpoint = "dslim/bert-base-NER"
        ner_tokenizer = AutoTokenizer.from_pretrained(ner_checkpoint)
        ner_model = AutoModelForTokenClassification.from_pretrained(ner_checkpoint)
        self.ner = pipeline('ner', model=ner_model, tokenizer=ner_tokenizer, grouped_entities=True)

        # Part-of-speech tagger.
        self.tagger = SequenceTagger.load("flair/pos-english-fast")

        if pickle:
            self.pickle_models()

        return self.ner, self.ner_dates, self.zero_shot_classifier, self.tagger

    def pickle_models(self):
        """Write the four loaded models to their pickle files."""
        attribute_to_stem = (
            ("ner", "ner"),
            ("zero_shot_classifier", "zero_shot_classifier_6"),
            ("ner_dates", "ner_dates"),
            ("tagger", "pos_tagger_fast"),
        )
        for attribute, stem in attribute_to_stem:
            self.pickle_it(getattr(self, attribute), stem)

    def load_pickled_models(self):
        """Read the four models back from their pickle files.

        NOTE(review): the tuple is ordered ``(ner_dates, ner, ...)`` whereas
        ``load_trained_models`` returns ``(ner, ner_dates, ...)``; unpack
        accordingly.

        :return: ``(ner_dates, ner, zero_shot_classifier, tagger)``
        """
        stems = ('ner_dates', 'ner', 'zero_shot_classifier_6', "pos_tagger_fast")
        return tuple(self.unpickle_it(stem) for stem in stems)

    def get_flair_sentence(self, sent):
        """Wrap *sent* in a flair ``Sentence``."""
        return Sentence(sent)
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Resume Parser
3
+ emoji: 🏢
4
+ colorFrom: pink
5
+ colorTo: red
6
+ sdk: gradio
7
+ sdk_version: 3.9.1
8
+ app_file: app.py
9
+ pinned: false
10
+ license: openrail
11
+ duplicated_from: Sybghat/resume-parser
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
ResumeParser.py ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from Models import Models
2
+ from ResumeSegmenter import ResumeSegmenter
3
+ from datetime import datetime
4
+ from dateutil import parser
5
+ import re
6
+ from string import punctuation
7
+
8
class ResumeParser:
    """Extract structured fields from a segmented resume.

    A ``ResumeSegmenter`` splits the resume lines into sections; this class
    then fills ``parsed_cv`` with the name, e-mail address, education,
    skills and job history found in those sections.
    """

    # Zero-shot labels that mean "this text names an employer".
    _COMPANY_LABELS = ("organization", "company", "institution")
    # Splits on runs of punctuation; "&" is deliberately not a separator.
    _PUNCT_SPLITTER = re.compile(r'[{}]+'.format(re.escape(punctuation.replace("&", ""))))
    # A 4-digit number that is not part of a longer run of digits.
    _YEAR_TOKEN = re.compile(r'(?<!\d)\d{4}(?!\d)')

    def __init__(self, ner, ner_dates, zero_shot_classifier, tagger):
        """Store the already-loaded models and build the segmenter.

        :param ner: callable returning entity dicts with ``word`` and
            ``entity_group`` keys (used for ``ORG`` entities)
        :param ner_dates: same kind of callable, used for ``DATE`` entities
        :param zero_shot_classifier: callable ``(text, labels)`` returning a
            dict with ``labels`` and ``scores``
        :param tagger: part-of-speech tagger exposing ``predict(sentence)``
        """
        self.models = Models()
        self.segmenter = ResumeSegmenter(zero_shot_classifier)
        self.ner, self.ner_dates, self.zero_shot_classifier, self.tagger = ner, ner_dates, zero_shot_classifier, tagger
        self.parsed_cv = {}

    def parse(self, resume_lines):
        """Segment *resume_lines* and return the parsed fields as a dict."""
        # Use a fresh dict per call: the previous result was handed to the
        # caller and must not be mutated by the next parse.
        self.parsed_cv = {}
        resume_segments = self.segmenter.segment(resume_lines)
        print("***************************** Parsing the Resume...***************************** ")
        for segment_name in resume_segments:
            segment = resume_segments[segment_name]
            if segment_name == "work_and_employment":
                self.parse_job_history(segment)
            elif segment_name == "contact_info":
                self.parse_contact_info(segment)
            elif segment_name == "education_and_training":
                self.parse_education(segment)
            elif segment_name in ("skills", "skills_header"):
                # The segmenter stores this section under "skills";
                # "skills_header" is still accepted for older callers.
                self.parse_skills(segment)
                print("************************************** SKILLS HEADER ***************************** <br>", segment)
        return self.parsed_cv

    def parse_education(self, education_and_training):
        """Store the education section unchanged under ``Education``."""
        print(education_and_training)
        self.parsed_cv['Education'] = education_and_training

    def parse_skills(self, skills_header):
        """Store the skills section unchanged under ``Skills``."""
        self.parsed_cv['Skills'] = skills_header

    def parse_contact_info(self, contact_info):
        """Fill ``Name`` and ``Contact Info`` from the lines above the first header."""
        name = self.find_person_name(contact_info)
        email = self.find_contact_email(contact_info)
        self.parsed_cv['Name'] = name
        self.parsed_cv['Contact Info'] = {"Email": email}

    def find_person_name(self, items):
        """Return the fragment of *items* most confidently classified as a person name.

        Each line is split on punctuation and stripped of digits; fragments
        with fewer than two words are ignored. Returns ``""`` when no
        fragment is classified as a person name.
        """
        classes = ["person name", "address", "email", "title"]
        candidates = []
        for item in items:
            for element in self._PUNCT_SPLITTER.split(item):
                # Strip again after removing digits so the returned name has
                # no stray surrounding whitespace.
                element = ''.join(ch for ch in element if not ch.isdigit()).strip()
                if len(element.split()) < 2:
                    continue
                out = self.zero_shot_classifier(element, classes)
                label, score = max(zip(out["labels"], out["scores"]), key=lambda pair: pair[1])
                if label == "person name":
                    candidates.append((element, score))
        if candidates:
            return max(candidates, key=lambda pair: pair[1])[0]
        return ""

    def find_contact_email(self, items):
        """Return the first e-mail address found in *items*, or ``""``."""
        for item in items:
            match = re.search(r'[\w.+-]+@[\w-]+\.[\w.-]+', item)
            if match:
                return match.group(0)
        return ""

    def parse_job_history(self, resume_segment):
        """Build ``parsed_cv["Job History"]`` from the work-experience lines.

        Every line recognised as a job title starts a new entry holding the
        title, the company and the start/end dates found near it.
        """
        idx_job_title = self.get_job_titles(resume_segment)
        if not idx_job_title:
            self.parsed_cv["Job History"] = []
            return
        # When the first title is on the first line, company and dates are
        # looked up on/after the title line; otherwise the line above each
        # title is included.
        current_and_below = idx_job_title[0][0] == 0
        job_history = []
        for ls_idx, (idx, job_title) in enumerate(idx_job_title):
            job_info = {"Job Title": self.filter_job_title(job_title)}

            neighbour = idx + 1 if current_and_below else idx - 1
            job_info["Company"] = self.get_job_company(idx, neighbour, resume_segment)

            span_start = idx if current_and_below else idx - 1
            if ls_idx == len(idx_job_title) - 1:
                span_end = len(resume_segment)
            else:
                span_end = idx_job_title[ls_idx + 1][0]
            start, end = self.get_job_dates(span_start, span_end, resume_segment)
            job_info["Start Date"] = start
            job_info["End Date"] = end
            job_history.append(job_info)
        self.parsed_cv["Job History"] = job_history

    def get_job_titles(self, resume_segment):
        """Return ``(index, line)`` for every line that looks like a job title.

        A line qualifies when it contains no verb, its most frequent
        part-of-speech tag is ``NN`` or ``NNP`` and the zero-shot classifier
        ranks "job title" or "organization" highest.
        """
        classes = ["organization", "institution", "company", "job title", "work details"]
        idx_line = []
        for idx, line in enumerate(resume_segment):
            digitless = ''.join(ch for ch in line if not ch.isdigit())
            if not digitless.strip():
                # Nothing left to tag (e.g. the line was only a year).
                continue
            sentence = self.models.get_flair_sentence(digitless)
            self.tagger.predict(sentence)
            tags = [span.tag for span in sentence.get_spans('pos')]
            if not tags:
                # Without tags there is no "most common tag" to inspect.
                continue
            if any(tag.startswith("V") for tag in tags):
                continue
            # Iterating the list (not a set) makes ties resolve to the tag
            # seen first, so the result is deterministic.
            most_common_tag = max(tags, key=tags.count)
            if most_common_tag not in ("NNP", "NN"):
                continue
            out = self.zero_shot_classifier(line, classes)
            top_label = max(zip(out["labels"], out["scores"]), key=lambda pair: pair[1])[0]
            if top_label in ("job title", "organization"):
                idx_line.append((idx, line))
        return idx_line

    def get_job_dates(self, st, end, resume_segment):
        """Return ``(start, end)`` dates found in ``resume_segment[st:end]``.

        Only ``DATE`` entities containing a valid year are considered. If
        one of the first two entities holds two years it is split into
        start and end; otherwise the first two entities are used. Missing
        values are returned as ``""``.
        """
        dates = [
            found
            for line in resume_segment[st:end]
            for found in self.get_ner_in_line(line, "DATE")
            if self.isvalidyear(found.strip())
        ]
        if not dates:
            return ("", "")
        for candidate in dates[:2]:
            if self.has_two_dates(candidate):
                d1, d2 = self.get_two_dates(candidate)
                return self.format_date(d1), self.format_date(d2)
        if len(dates) > 1:
            return self.format_date(dates[0]), self.format_date(dates[1])
        return (self.format_date(dates[0]), "")

    def filter_job_title(self, job_title):
        """Keep only the parts of *job_title* classified as a title or organization.

        The line is split on punctuation after removing digits. When no
        part is classified as "job title"/"organization", all parts are
        returned joined by ``", "``.
        """
        job_title = ''.join(ch for ch in job_title if not ch.isdigit())
        tokens = [
            ''.join(ch for ch in tok.strip() if ch.isalpha() or ch.strip() == "")
            for tok in self._PUNCT_SPLITTER.split(job_title)
            if tok.strip()
        ]
        classes = ["company", "organization", "institution", "job title", "responsibility", "details"]
        new_title = []
        for token in tokens:
            if not token:
                continue
            res = self.zero_shot_classifier(token, classes)
            top_label = max(zip(res["labels"], res["scores"]), key=lambda pair: pair[1])[0]
            if top_label in ("job title", "organization"):
                new_title.append(token.strip())
        if new_title:
            return ', '.join(new_title)
        return ', '.join(tokens)

    def _find_years(self, text):
        """Return regex matches for each stand-alone valid year in *text*.

        A year is valid when it lies between 100 years ago and the current
        year, both inclusive. Repeated years yield one match each.
        """
        newest = datetime.today().year
        oldest = newest - 100
        return [
            match
            for match in self._YEAR_TOKEN.finditer(str(text))
            if oldest <= int(match.group()) <= newest
        ]

    def has_two_dates(self, date):
        """Return True when *date* contains exactly two valid years."""
        return len(self._find_years(date)) == 2

    def get_two_dates(self, date):
        """Split *date* right after its first valid year.

        :return: ``(first, second)``; ``(date, "")`` when no year is present
        """
        matches = self._find_years(date)
        if not matches:
            return date, ""
        cut = matches[0].end()
        return date[:cut], date[cut:]

    def get_valid_years(self):
        """Return the accepted years as strings, oldest first.

        The range covers the last 100 years and includes the current year.
        """
        current_year = datetime.today().year
        return [str(year) for year in range(current_year - 100, current_year + 1)]

    def format_date(self, date):
        """Return *date* as ``MM-YYYY`` when it can be parsed.

        Falls back to parsing a cleaned copy of the text and finally to the
        cleaned text itself.
        """
        parsed = self.parse_date(date)
        if parsed:
            return parsed
        cleaned = self.clean_date(date)
        return self.parse_date(cleaned) or cleaned

    def clean_date(self, date):
        """Drop every character except letters, digits, ``-`` and ``/``.

        Values that cannot be iterated as text are returned unchanged.
        """
        try:
            return ''.join(ch for ch in date if ch.isalnum() or ch == '-' or ch == '/')
        except (TypeError, AttributeError):
            return date

    def parse_date(self, date):
        """Parse *date* with dateutil and return ``MM-YYYY``, or ``0`` on failure.

        Fields missing from the text default to January 1st of the current
        year, so a bare year gives the same result on any run date.
        """
        fallback = datetime(datetime.today().year, 1, 1)
        try:
            return parser.parse(date, default=fallback).strftime("%m-%Y")
        except Exception:
            # Best effort: any parsing problem means "not a date".
            return 0

    def isvalidyear(self, date):
        """Return True when *date* contains at least one valid year."""
        return bool(self._find_years(date))

    def get_ner_in_line(self, line, entity_type):
        """Return the words of all *entity_type* entities found in *line*.

        ``DATE`` entities come from the date-aware model, every other type
        from the general NER model.
        """
        if not line:
            return []
        ner = self.ner_dates if entity_type == "DATE" else self.ner
        return [entity['word'] for entity in ner(line) if entity['entity_group'] == entity_type]

    def get_job_company(self, idx, idx1, resume_segment):
        """Return the most likely company for the job title on line *idx*.

        ``ORG`` entities from the title line and from the neighbouring line
        *idx1* are scored with the zero-shot classifier. When no entity is
        found, the neighbouring line itself is returned.
        """
        job_title = resume_segment[idx]
        context = resume_segment[idx1] if 0 <= idx1 < len(resume_segment) else ""
        candidate_companies = self.get_ner_in_line(job_title, "ORG") + self.get_ner_in_line(context, "ORG")
        classes = ["organization", "company", "institution", "not organization", "not company", "not institution"]
        best_company, best_score = None, -1.0
        for comp in candidate_companies:
            res = self.zero_shot_classifier(comp, classes)
            # The result is ordered by score, not by ``classes``, so look the
            # positive labels up by name instead of slicing the score list.
            by_label = dict(zip(res["labels"], res["scores"]))
            score = max(by_label.get(label, 0.0) for label in self._COMPANY_LABELS)
            if score > best_score:
                best_company, best_score = comp, score
        if best_company is not None:
            return best_company
        return context
ResumeReader.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import os
3
+ import logging
4
+ import pdfplumber
5
+ import fitz
6
+
7
class ResumeReader:
    """Read a resume file and return its text as a list of cleaned lines."""

    def _normalize_breaks(self, text):
        """Collapse repeated newlines and turn ``\\r``/tabs into newline/space."""
        text = re.sub(r'\n+', '\n', text)
        return text.replace("\r", "\n").replace("\t", " ")

    def _split_lines(self, text):
        """Return the non-blank lines of *text*, trimmed and whitespace-collapsed."""
        return [re.sub(r'\s+', ' ', line.strip()) for line in text.splitlines() if line.strip()]

    def convert_docx_to_txt(self, docx_file, docx_parser):
        """Return ``(lines, raw_text)`` for a Word document.

        NOTE: no DOCX/DOC extractor is wired in, so the document is not
        read and the result is always ``([], "")``.

        :param docx_file: path of the uploaded document (currently unused)
        :param docx_parser: name of the requested parser (currently unused)
        :return: tuple of the cleaned lines and the raw text
        """
        text = ""
        return self._split_lines(self._normalize_breaks(text)), text

    def convert_pdf_to_txt(self, pdf_file):
        """Extract the text of a machine-readable PDF with PyMuPDF (``fitz``).

        :param pdf_file: path to the ``.pdf`` file
        :return: ``(lines, raw_text)`` where *lines* are the cleaned,
            non-blank lines; ``([], " ")`` when the file cannot be read
        """
        try:
            # The context manager closes the document even if a page fails.
            with fitz.open(pdf_file) as doc:
                raw_text = "".join(page.get_text() for page in doc)
        except Exception as e:
            # Boundary with a third-party reader: log and report "no text".
            logging.error('Error in pdf file:: ' + str(e))
            return [], " "
        logging.debug("Extracted %d characters from PDF", len(raw_text))

        full_string = self._normalize_breaks(raw_text)

        # Remove bullet glyphs and "(cid:N)" placeholders left by extraction.
        full_string = re.sub(r"\uf0b7", " ", full_string)
        full_string = re.sub(r"\(cid:\d{0,3}\)", " ", full_string)
        full_string = re.sub(r'• ', " ", full_string)

        return self._split_lines(full_string), raw_text

    def read_file(self, file, docx_parser="tika"):
        """Return the cleaned lines of the resume stored at *file*.

        The type is chosen from the file name ending, ignoring case.

        :param file: path of the resume file
        :param docx_parser: parser name forwarded to ``convert_docx_to_txt``
        :return: list of lines, or ``None`` for an unsupported file type
        """
        print("Reading the Resume...")
        file = os.fspath(file)
        # Compare in lower case so "CV.PDF" is handled like "cv.pdf".
        lowered = file.lower()
        if lowered.endswith(('docx', 'doc')):
            resume_lines, _ = self.convert_docx_to_txt(file, docx_parser)
        elif lowered.endswith('pdf'):
            resume_lines, _ = self.convert_pdf_to_txt(file)
        elif lowered.endswith('txt'):
            with open(file, 'r', encoding='utf-8') as f:
                # Clean text files the same way as PDF text so callers never
                # see blank lines or trailing newlines.
                resume_lines = self._split_lines(self._normalize_breaks(f.read()))
        else:
            resume_lines = None

        return resume_lines
ResumeSegmenter.py ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from Models import Models
2
+
3
class ResumeSegmenter:
    """Split resume lines into sections by recognising section headers.

    A line counts as a header when it does not start with a lower-case
    character and its lower-cased text starts with one of the keywords
    below. All keywords are lower case because they are compared against
    lower-cased lines.
    """

    def __init__(self, zero_shot_classifier):
        """Keep the classifier used to choose between competing sub-sections."""
        self.zero_shot_classifier = zero_shot_classifier

    objective = (
        'career goal',
        'objective',
        'career objective',
        'employment objective',
        'professional objective',
        'summary',
        'summary of qualifications',
        'digital',
        'interests'
    )

    work_and_employment = (
        'employment history',
        'employment data',
        'career summary',
        'work history',
        'working history',
        'work experience',
        'experience',
        'professional experience',
        'professional background',
        'professional employment',
        'additional experience',
        'career related experience',
        "professional employment history",
        'related experience',
        'relevant experience',
        'programming experience',
        'freelance',
        'freelance experience',
        'army experience',
        'military experience',
        'military background',
    )

    education_and_training = (
        'academic background',
        'academic experience',
        'programs',
        'courses',
        'related courses',
        'education',
        'educational background',
        'educational qualifications',
        'educational training',
        'education and training',
        'training',
        'academic training',
        'academic qualification',
        'professional training',
        'course project experience',
        'related course projects',
        'internship experience',
        'internships',
        'apprenticeships',
        'college activities',
        'certifications',
        'special training',
    )

    skills_header = (
        'credentials',
        'qualifications',
        'areas of experience',
        'areas of expertise',
        'areas of knowledge',
        'skills',
        "other skills",
        "other abilities",
        'career related skills',
        'professional skills',
        'specialized skills',
        'technical skills',
        'computer skills',
        'personal skills',
        'computer knowledge',
        'technologies',
        'technical experience',
        'proficiencies',
        'languages',
        'language competencies and skills',
        'programming languages',
        'competencies'
    )

    misc = (
        'activities and honors',
        'activities',
        'affiliations',
        'professional affiliations',
        'associations',
        'professional associations',
        'memberships',
        'professional memberships',
        'athletic involvement',
        'community involvement',
        'refere',
        'civic activities',
        'extra-curricular activities',
        'professional activities',
        'volunteer work',
        'volunteer experience',
        'additional information',
        'interests'
    )

    accomplishments = (
        'achievement',
        'awards and achievements',
        'licenses',
        'presentations',
        'conference presentations',
        'conventions',
        'dissertations',
        'exhibits',
        'papers',
        'publications',
        'professional publications',
        'research experience',
        'research grants',
        'project',
        'research projects',
        'personal projects',
        'current research interests',
        'thesis',
        'theses',
    )

    def find_segment_indices(self, string_to_search, resume_segments, resume_indices):
        """Record the line index of every section header.

        For each header line the matching keyword is stored in
        ``resume_segments[section]`` with the line index as value, and the
        index is appended to ``resume_indices``. Sections are tried in a
        fixed order and the first one with a matching keyword wins.

        :param string_to_search: list of resume lines
        :param resume_segments: dict of section name -> dict, filled in place
        :param resume_indices: list of header line indices, filled in place
        """
        sections = (
            ('objective', self.objective),
            ('work_and_employment', self.work_and_employment),
            ('education_and_training', self.education_and_training),
            ('skills', self.skills_header),
            ('misc', self.misc),
            ('accomplishments', self.accomplishments),
        )
        for i, line in enumerate(string_to_search):
            # Empty lines have no first character to inspect; lines starting
            # in lower case are treated as body text.
            if not line or line[0].islower():
                continue

            header = line.lower()
            for section, keywords in sections:
                keyword = next((k for k in keywords if header.startswith(k.lower())), None)
                if keyword is None:
                    continue
                # NOTE(review): the lookup uses the whole header line while
                # entries are stored under the keyword, so a later line with
                # the same keyword replaces the earlier index. Kept as in the
                # original; confirm whether "first header wins" was intended.
                if header not in resume_segments[section]:
                    resume_indices.append(i)
                    resume_segments[section][keyword] = i
                break

    def slice_segments(self, string_to_search, resume_segments, resume_indices):
        """Replace every recorded header index by the lines of its section.

        Lines before the first header become ``contact_info``; each section
        runs from its header line up to the next recorded header (or the
        end of the resume).

        :return: dict of section name -> ``(start, end)`` of the last
            sub-section processed for that section
        """
        resume_segments['contact_info'] = string_to_search[:resume_indices[0]]
        sec_idxs = {}
        for section, value in resume_segments.items():
            if section == 'contact_info':
                continue

            for sub_section, start_idx in value.items():
                position = resume_indices.index(start_idx)
                if position + 1 < len(resume_indices):
                    end_idx = resume_indices[position + 1]
                else:
                    end_idx = len(string_to_search)

                sec_idxs[section] = (start_idx, end_idx)
                # Only the value of an existing key changes, which is safe
                # while iterating over the dict.
                value[sub_section] = string_to_search[start_idx:end_idx]
        return sec_idxs

    def find_true_segment(self, dict_of_segments, segment_name):
        """Return the sub-section key that best matches *segment_name*.

        Each sub-section's text is classified against the section label and
        "other"; the key with the highest score for the section label is
        returned, or ``0`` when *dict_of_segments* is empty.
        """
        segment_classes = {
            'objective': ["objective", "other"],
            'work_and_employment': ["employment history", "other"],
            'education_and_training': ["education", "other"],
            'skills': ["skills", "other"],
            'accomplishments': ["accomplishments", "other"],
            'misc': ["misc", "other"],
            'contact_info': ["contact information", "other"]
        }
        classes = segment_classes[segment_name]
        target = classes[0]
        best_key, best_score = 0, None
        for key, sequence in dict_of_segments.items():
            out = self.zero_shot_classifier(' '.join(sequence), classes)
            # Scores are ordered by likelihood, so read the target label's
            # score by name rather than taking the first entry.
            score = dict(zip(out["labels"], out["scores"])).get(target, 0.0)
            if best_score is None or score > best_score:
                best_key, best_score = key, score
        return best_key

    def segment(self, string_to_search):
        """Split the resume lines into sections.

        :param string_to_search: list of resume lines
        :return: dict with a ``contact_info`` list plus one entry per
            section. A section with one header becomes the list of lines
            below that header; ``work_and_employment`` with several headers
            is reduced to its best-matching sub-section; a section without
            a header stays an empty dict, and other sections with several
            headers keep their dict of sub-sections.
        """
        print("Segmenting the Resume..")
        resume_segments = {
            'objective': {},
            'work_and_employment': {},
            'education_and_training': {},
            'skills': {},
            'accomplishments': {},
            'misc': {}
        }

        resume_indices = []

        self.find_segment_indices(string_to_search, resume_segments, resume_indices)
        if resume_indices:
            self.slice_segments(string_to_search, resume_segments, resume_indices)
        else:
            resume_segments['contact_info'] = []

        for segment in resume_segments:
            if segment == "contact_info":
                continue
            sub_sections = resume_segments[segment]
            if len(sub_sections) <= 1:
                if len(sub_sections) == 1:
                    only_key = next(iter(sub_sections))
                    # Drop the header line itself.
                    resume_segments[segment] = sub_sections[only_key][1:]
                continue
            if segment != "work_and_employment":
                continue
            true_seg = self.find_true_segment(sub_sections, segment)
            if not true_seg:
                resume_segments[segment] = []
            else:
                resume_segments[segment] = sub_sections[true_seg][1:]

        return resume_segments
app.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydoc import describe
2
+ import gradio as gr
3
+ from main import Main
4
+
5
+
6
# Build the parser once at start-up; every request reuses this instance.
main = Main()


def parse_cv(cv):
    """Parse the uploaded resume and return the structured result.

    :param cv: uploaded file object from the Gradio file input; its
        ``name`` attribute is passed on as the file path
    :return: dict produced by ``Main.parse_cv``; an empty dict when no
        file was provided
    """
    if cv is None:
        # NOTE(review): Gradio appears to pass None when the form is
        # submitted without a file — confirm for the pinned version.
        # Return an empty result instead of failing on ``cv.name``.
        return {}
    return main.parse_cv(cv.name)


description = """A demo for a CV parser."""
article = "Resume Parser by Sybghat"
file_input = gr.inputs.File(file_count="single", type="file", label="Upload a CV: .PDF Or .TXT", optional=False)
iface = gr.Interface(fn=parse_cv, inputs=file_input, outputs="json", allow_flagging="never",
                     allow_screenshot=False, title="CV Parser", theme="seafoam", description=description, article=article)

# Start the web UI.
iface.launch()
main.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ResumeReader import ResumeReader
2
+ from ResumeParser import ResumeParser
3
+ from Models import Models
4
+ import json
5
+ import os
6
+
7
+
8
class Main:
    """Wire the reader, the models and the parser into one entry point."""

    def __init__(self):
        """Load the models and build the reader and parser."""
        models = Models()
        ner, ner_dates, zero_shot_classifier, tagger = models.load_trained_models()
        self.reader = ResumeReader()
        self.parser = ResumeParser(ner, ner_dates, zero_shot_classifier, tagger)

    def parse_cv(self, file_path):
        """Read the resume at *file_path* and return its parsed fields.

        :param file_path: path of the resume file
        :return: dict produced by the parser
        :raises ValueError: when the reader does not support the file type
        """
        resume_lines = self.reader.read_file(file_path)
        if resume_lines is None:
            # The reader returns None for unsupported file types; fail with
            # a clear message instead of an obscure error inside the parser.
            raise ValueError(f"Unsupported resume file type: {file_path!r}")
        return self.parser.parse(resume_lines)

    def save_parse_as_json(self, dict, file_name):
        """Write the parse result to *file_name* as indented UTF-8 JSON.

        :param dict: the parse result to serialise (the name shadows the
            builtin but is kept for keyword callers)
        :param file_name: path of the JSON file to create or overwrite
        """
        print("Saving the parse...")
        with open(file_name, 'w', encoding="utf-8") as f:
            # default=str turns values JSON cannot encode into strings.
            json.dump(dict, f, indent=4, default=str, ensure_ascii=False)
readMe.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Activate Virtual environment:
2
+ resume-parser/Scripts/activate.bat // CMD
3
+ resume-parser/Scripts/activate.ps1 //Powershell
requirements.txt ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pip==22.3.1
2
+ aiohttp==3.8.1
3
+ aiosignal==1.2.0
4
+ analytics-python==1.4.0
5
+ anyio==3.5.0
6
+ asgiref==3.5.0
7
+ async-timeout==4.0.2
8
+ attrs==21.4.0
9
+ backoff==1.10.0
10
+ bcrypt==3.2.0
11
+ bpemb==0.3.3
12
+ certifi==2021.10.8
13
+ cffi==1.15.0
14
+ chardet==4.0.0
15
+ charset-normalizer==2.0.11
16
+ click==8.0.3
17
+ colorama==0.4.4
18
+ coloredlogs==15.0.1
19
+ conllu==4.4.1
20
+ cryptography==36.0.1
21
+ cycler==0.11.0
22
+ Cython==0.29.23
23
+ Deprecated==1.2.13
24
+ doc2text==0.2.4
25
+ fastapi==0.73.0
26
+ ffmpy==0.3.0
27
+ filelock==3.4.2
28
+ flair==0.10
29
+ flatbuffers==2.0
30
+ fonttools==4.29.1
31
+ frozenlist==1.3.0
32
+ ftfy==6.0.3
33
+ future==0.18.2
34
+ gdown==3.12.2
35
+ gensim==4.1.2
36
+ gradio==2.7.5.2
37
+ h11==0.13.0
38
+ huggingface-hub==0.4.0
39
+ humanfriendly==10.0
40
+ idna==3.3
41
+ importlib-metadata==3.10.1
42
+ Janome==0.4.1
43
+ Jinja2==3.0.3
44
+ joblib==1.1.0
45
+ kiwisolver==1.3.2
46
+ konoha==4.6.5
47
+ langdetect==1.0.9
48
+ markdown2==2.4.2
49
+ MarkupSafe==2.0.1
50
+ matplotlib==3.5.1
51
+ mime==0.1.0
52
+ monotonic==1.6
53
+ more-itertools==8.8.0
54
+ mpld3==0.3
55
+ multidict==6.0.2
56
+ numpy==1.22.1
57
+ overrides==3.1.0
58
+ packaging==21.3
59
+ pandas==1.4.0
60
+ paramiko==2.9.2
61
+ pdfminer.six==20211012
62
+ pdfplumber==0.6.0
63
+ Pillow==9.0.1
64
+ protobuf==3.19.4
65
+ psutil==5.9.0
66
+ py-cpuinfo==8.0.0
67
+ py3nvml==0.2.7
68
+ pycparser==2.21
69
+ pycryptodome==3.14.1
70
+ pydantic==1.9.0
71
+ pydub==0.25.1
72
+ PyNaCl==1.5.0
73
+ pyparsing==3.0.7
74
+ PyPDF2==1.26.0
75
+ pyreadline3==3.4.1
76
+ PySocks==1.7.1
77
+ pytesseract==0.3.8
78
+ python-dateutil==2.8.2
79
+ python-multipart==0.0.5
80
+ pytz==2021.3
81
+ PyYAML==6.0
82
+ regex==2022.1.18
83
+ requests==2.27.1
84
+ sacremoses==0.0.47
85
+ scikit-learn==1.0.2
86
+ scipy==1.7.3
87
+ segtok==1.5.11
88
+ sentencepiece==0.1.95
89
+ six==1.16.0
90
+ smart-open==5.2.1
91
+ sniffio==1.2.0
92
+ sqlitedict==1.7.0
93
+ starlette==0.17.1
94
+ tabulate==0.8.9
95
+ threadpoolctl==3.1.0
96
+ tokenizers==0.10.3
97
+ torch==1.10.2
98
+ tqdm==4.62.3
99
+ transformers==4.15.0
100
+ typing_extensions==4.0.1
101
+ urllib3==1.26.8
102
+ uvicorn==0.17.4
103
+ Wand==0.6.7
104
+ wcwidth==0.2.5
105
+ Wikipedia-API==0.5.4
106
+ wincertstore==0.2
107
+ wrapt==1.13.3
108
+ xmltodict==0.12.0
109
+ yarl==1.7.2
110
+ zipp==3.7.0
111
+ PyMuPDF==1.19.0