Basit34 asimokby committed on
Commit
239854d
·
0 Parent(s):

Duplicate from asimokby/cv-parser-huggingface

Browse files

Co-authored-by: Asem <asimokby@users.noreply.huggingface.co>

Files changed (9) hide show
  1. .gitattributes +27 -0
  2. Main.py +23 -0
  3. Models.py +58 -0
  4. README.md +47 -0
  5. ResumeParser.py +241 -0
  6. ResumeReader.py +99 -0
  7. ResumeSegmenter.py +259 -0
  8. app.py +18 -0
  9. requirements.txt +109 -0
.gitattributes ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
5
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.model filter=lfs diff=lfs merge=lfs -text
12
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
13
+ *.onnx filter=lfs diff=lfs merge=lfs -text
14
+ *.ot filter=lfs diff=lfs merge=lfs -text
15
+ *.parquet filter=lfs diff=lfs merge=lfs -text
16
+ *.pb filter=lfs diff=lfs merge=lfs -text
17
+ *.pt filter=lfs diff=lfs merge=lfs -text
18
+ *.pth filter=lfs diff=lfs merge=lfs -text
19
+ *.rar filter=lfs diff=lfs merge=lfs -text
20
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
21
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
22
+ *.tflite filter=lfs diff=lfs merge=lfs -text
23
+ *.tgz filter=lfs diff=lfs merge=lfs -text
24
+ *.xz filter=lfs diff=lfs merge=lfs -text
25
+ *.zip filter=lfs diff=lfs merge=lfs -text
26
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
27
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
Main.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ResumeReader import ResumeReader
2
+ from ResumeParser import ResumeParser
3
+ from Models import Models
4
+ import json
5
+ import os
6
+
7
+
8
class Main:
    """End-to-end CV pipeline: load the models once, then read and parse resumes."""

    def __init__(self):
        """Load every model and wire up the reader and the parser."""
        models = Models()
        ner, ner_dates, zero_shot_classifier, tagger = models.load_trained_models()
        self.reader = ResumeReader()
        self.parser = ResumeParser(ner, ner_dates, zero_shot_classifier, tagger)

    def parse_cv(self, file_path):
        """Read the resume at ``file_path`` and return the parsed result.

        :param file_path: path to the resume file
        :return: whatever ``self.parser.parse`` returns for the resume lines
        :raises ValueError: if the reader yields ``None``, which it does for
            file types it does not handle
        """
        resume_lines = self.reader.read_file(file_path)
        # Fail with a clear message here instead of letting ``None`` reach
        # the parser, where it surfaces as an opaque TypeError.
        if resume_lines is None:
            raise ValueError(f"Unsupported resume file type: {file_path!r}")
        return self.parser.parse(resume_lines)

    def save_parse_as_json(self, dict, file_name):
        """Write a parse result to ``file_name`` as indented UTF-8 JSON.

        :param dict: the object to serialise (the name shadows the builtin but
            is kept so keyword callers keep working)
        :param file_name: destination path; overwritten if it exists
        """
        print("Saving the parse...")
        with open(file_name, 'w', encoding="utf-8") as f:
            # default=str stringifies anything json cannot serialise natively.
            json.dump(dict, f, indent=4, default=str, ensure_ascii=False)
Models.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoModelForSequenceClassification
2
+ from transformers import pipeline
3
+ from flair.data import Sentence
4
+ from flair.models import SequenceTagger
5
+ import pickle
6
+
7
+
8
+
9
class Models:
    """Loads the transformers / flair models used by the parser, with optional pickling."""

    def pickle_it(self, obj, file_name):
        """Pickle ``obj`` to ``<file_name>.pickle``."""
        with open(f'{file_name}.pickle', 'wb') as f:
            pickle.dump(obj, f)

    def unpickle_it(self, file_name):
        """Load and return the object stored in ``<file_name>.pickle``.

        SECURITY: ``pickle.load`` can execute arbitrary code; only use this on
        pickle files this application wrote itself.
        """
        with open(f'{file_name}.pickle', 'rb') as f:
            return pickle.load(f)

    def load_trained_models(self, pickle=False):
        """Download/load all models and return them.

        :param pickle: when true, also dump the loaded models to disk via
            ``pickle_models`` (the name shadows the ``pickle`` module inside
            this method only; the module itself is not used here)
        :return: ``(ner, ner_dates, zero_shot_classifier, tagger)``
        """
        # NER (dates)
        tokenizer = AutoTokenizer.from_pretrained("Jean-Baptiste/camembert-ner-with-dates")
        model = AutoModelForTokenClassification.from_pretrained("Jean-Baptiste/camembert-ner-with-dates")
        self.ner_dates = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")

        # Zero Shot Classification
        # self.zero_shot_classifier = pipeline("zero-shot-classification", model='facebook/bart-large-mnli')
        self.zero_shot_classifier = pipeline("zero-shot-classification", model='valhalla/distilbart-mnli-12-6')

        # NER
        tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
        model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
        # aggregation_strategy="simple" replaces the deprecated
        # grouped_entities=True and matches the dates pipeline above.
        self.ner = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")

        # POS tagging
        self.tagger = SequenceTagger.load("flair/pos-english-fast")

        if pickle:
            self.pickle_models()

        return self.ner, self.ner_dates, self.zero_shot_classifier, self.tagger

    def pickle_models(self):
        """Pickle the four loaded models into the current working directory."""
        self.pickle_it(self.ner, "ner")
        self.pickle_it(self.zero_shot_classifier, "zero_shot_classifier_6")
        self.pickle_it(self.ner_dates, "ner_dates")
        self.pickle_it(self.tagger, "pos_tagger_fast")

    def load_pickled_models(self):
        """Load the models written by ``pickle_models``.

        :return: ``(ner, ner_dates, zero_shot_classifier, tagger)`` -- the same
            order as ``load_trained_models`` so the two loaders are
            interchangeable (previously ``ner_dates`` and ``ner`` were swapped)
        """
        ner = self.unpickle_it('ner')
        ner_dates = self.unpickle_it('ner_dates')
        zero_shot_classifier = self.unpickle_it('zero_shot_classifier_6')
        tagger = self.unpickle_it("pos_tagger_fast")
        return ner, ner_dates, zero_shot_classifier, tagger

    def get_flair_sentence(self, sent):
        """Wrap ``sent`` in a flair ``Sentence``."""
        return Sentence(sent)
README.md ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Cv Parser
3
+ emoji: 💩
4
+ colorFrom: green
5
+ colorTo: red
6
+ sdk: gradio
7
+ app_file: app.py
8
+ pinned: false
9
+ license: mit
10
+ duplicated_from: asimokby/cv-parser-huggingface
11
+ ---
12
+
13
+ # Configuration
14
+
15
+ `title`: _string_
16
+ Display title for the Space
17
+
18
+ `emoji`: _string_
19
+ Space emoji (emoji-only character allowed)
20
+
21
+ `colorFrom`: _string_
22
+ Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
23
+
24
+ `colorTo`: _string_
25
+ Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
26
+
27
+ `sdk`: _string_
28
+ Can be either `gradio`, `streamlit`, or `static`
29
+
30
+ `sdk_version` : _string_
31
+ Only applicable for `streamlit` SDK.
32
+ See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.
33
+
34
+ `app_file`: _string_
35
+ Path to your main application file (which contains either `gradio` or `streamlit` Python code, or `static` html code).
36
+ Path is relative to the root of the repository.
37
+
38
+ `models`: _List[string]_
39
+ HF model IDs (like "gpt2" or "deepset/roberta-base-squad2") used in the Space.
40
+ Will be parsed automatically from your code if not specified here.
41
+
42
+ `datasets`: _List[string]_
43
+ HF dataset IDs (like "common_voice" or "oscar-corpus/OSCAR-2109") used in the Space.
44
+ Will be parsed automatically from your code if not specified here.
45
+
46
+ `pinned`: _boolean_
47
+ Whether the Space stays on top of your list.
ResumeParser.py ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from Models import Models
2
+ from ResumeSegmenter import ResumeSegmenter
3
+ from datetime import datetime
4
+ from dateutil import parser
5
+ import re
6
+ from string import punctuation
7
+
8
class ResumeParser:
    """Extracts the name, e-mail and job history from segmented resume lines."""

    def __init__(self, ner, ner_dates, zero_shot_classifier, tagger):
        """Store the already-loaded models and build the segmenter.

        :param ner: callable NER pipeline, queried for "ORG" entities
        :param ner_dates: callable NER pipeline, queried for "DATE" entities
        :param zero_shot_classifier: callable ``(text, labels)`` returning a
            dict with parallel ``"labels"`` and ``"scores"`` lists
        :param tagger: POS tagger exposing ``predict(sentence)``
        """
        self.models = Models()
        self.segmenter = ResumeSegmenter(zero_shot_classifier)
        self.ner = ner
        self.ner_dates = ner_dates
        self.zero_shot_classifier = zero_shot_classifier
        self.tagger = tagger
        self.parsed_cv = {}

    def parse(self, resume_lines):
        """Segment ``resume_lines`` and return the parsed CV as a dict."""
        # Start from a clean result: the same parser instance is reused for
        # every request, so data from a previous resume must not leak through.
        self.parsed_cv = {}
        resume_segments = self.segmenter.segment(resume_lines)
        print("Parsing the Resume...")
        for segment_name, segment in resume_segments.items():
            if segment_name == "contact_info":
                self.parse_contact_info(segment)
            elif segment_name == "work_and_employment":
                self.parse_job_history(segment)
        return self.parsed_cv

    def _top_label(self, text, classes):
        """Return the ``(label, score)`` pair with the highest score for ``text``."""
        out = self.zero_shot_classifier(text, classes)
        return sorted(zip(out["labels"], out["scores"]), key=lambda x: x[1])[-1]

    def parse_contact_info(self, contact_info):
        """Fill ``parsed_cv['Name']`` and ``parsed_cv['Contact Info']``."""
        self.parsed_cv['Name'] = self.find_person_name(contact_info)
        self.parsed_cv['Contact Info'] = {"Email": self.find_contact_email(contact_info)}

    def find_person_name(self, items):
        """Return the fragment most confidently classified as a person name, or ""."""
        class_score = []
        splitter = re.compile(r'[{}]+'.format(re.escape(punctuation.replace("&", ""))))
        classes = ["person name", "address", "email", "title"]
        for item in items:
            for element in splitter.split(item):
                element = ''.join(ch for ch in element.strip() if not ch.isdigit())
                # Only fragments of at least two words are considered.
                if len(element.strip().split()) <= 1:
                    continue
                label, score = self._top_label(element, classes)
                if label == "person name":
                    class_score.append((element, score))
        if class_score:
            return max(class_score, key=lambda x: x[1])[0]
        return ""

    def find_contact_email(self, items):
        """Return the first e-mail address found in ``items``, or ""."""
        for item in items:
            match = re.search(r'[\w.+-]+@[\w-]+\.[\w.-]+', item)
            if match:
                return match.group(0)
        return ""

    def parse_job_history(self, resume_segment):
        """Build ``parsed_cv['Job History']`` from the work-experience lines."""
        idx_job_title = self.get_job_titles(resume_segment)
        if not idx_job_title:
            self.parsed_cv["Job History"] = []
            return
        # When the very first line is a job title, the company and dates are
        # looked for on the title line and below; otherwise the line above the
        # title is used as context.
        current_and_below = idx_job_title[0][0] == 0
        job_history = []
        for ls_idx, (idx, job_title) in enumerate(idx_job_title):
            job_info = {"Job Title": self.filter_job_title(job_title)}

            # Company
            context_idx = idx + 1 if current_and_below else idx - 1
            job_info["Company"] = self.get_job_company(idx, context_idx, resume_segment)

            # Dates: search from this title up to the next title (or the end).
            st_span = idx if current_and_below else idx - 1
            if ls_idx == len(idx_job_title) - 1:
                end_span = len(resume_segment)
            else:
                end_span = idx_job_title[ls_idx + 1][0]
            start, end = self.get_job_dates(st_span, end_span, resume_segment)
            job_info["Start Date"] = start
            job_info["End Date"] = end
            job_history.append(job_info)
        self.parsed_cv["Job History"] = job_history

    def get_job_titles(self, resume_segment):
        """Return ``(index, line)`` pairs for the lines that look like job titles.

        A line qualifies when its most frequent POS tag is ``NNP``, no tag
        starts with ``V``, and the zero-shot classifier ranks "job title" first.
        """
        classes = ["organization", "institution", "company", "job title", "work details"]
        idx_line = []
        for idx, line in enumerate(resume_segment):
            line_modifed = ''.join(ch for ch in line if not ch.isdigit())
            sentence = self.models.get_flair_sentence(line_modifed)
            self.tagger.predict(sentence)
            tags = [entity.tag for entity in sentence.get_spans('pos')]
            # Blank or digit-only lines produce no tags; max() on an empty
            # collection would raise, so skip them.
            if not tags:
                continue
            has_verb = any(tag.startswith("V") for tag in tags)
            most_common_tag = max(set(tags), key=tags.count)
            if most_common_tag != "NNP" or has_verb:
                continue
            if self._top_label(line, classes)[0] == "job title":
                idx_line.append((idx, line))
        return idx_line

    def get_job_dates(self, st, end, resume_segment):
        """Return ``(start, end)`` dates found in ``resume_segment[st:end]``.

        Missing dates are returned as empty strings.
        """
        dates = [
            dt
            for line in resume_segment[st:end]
            for dt in self.get_ner_in_line(line, "DATE")
            if self.isvalidyear(dt.strip())
        ]
        if not dates:
            return ("", "")

        first = dates[0]
        second = dates[1] if len(dates) > 1 else None

        # A single entity such as "2018 - 2020" may hold both dates.
        if self.has_two_dates(first):
            d1, d2 = self.get_two_dates(first)
            return self.format_date(d1), self.format_date(d2)
        if second is not None and self.has_two_dates(second):
            d1, d2 = self.get_two_dates(second)
            return self.format_date(d1), self.format_date(d2)
        if second is not None:
            return self.format_date(first), self.format_date(second)
        return (self.format_date(first), "")

    def filter_job_title(self, job_title):
        """Keep only the fragments of ``job_title`` classified as a job title.

        Falls back to all fragments, comma-joined, when none is classified so.
        """
        job_title_splitter = re.compile(r'[{}]+'.format(re.escape(punctuation.replace("&", ""))))
        job_title = ''.join(ch for ch in job_title if not ch.isdigit())
        tokens = job_title_splitter.split(job_title)
        tokens = [
            ''.join(ch for ch in tok.strip() if (ch.isalpha() or ch.strip() == ""))
            for tok in tokens if tok.strip()
        ]
        classes = ["company", "organization", "institution", "job title", "responsibility", "details"]
        new_title = []
        for token in tokens:
            if not token:
                continue
            if self._top_label(token, classes)[0] == "job title":
                new_title.append(token.strip())
        if new_title:
            return ', '.join(new_title)
        return ', '.join(tokens)

    def has_two_dates(self, date):
        """Return True when exactly two distinct valid years occur in ``date``."""
        text = str(date)
        return sum(1 for year in self.get_valid_years() if year in text) == 2

    def get_two_dates(self, date):
        """Split ``date`` right after its earliest-positioned year.

        :return: ``(first, second)``; ``(date, "")`` when no valid year is found
        """
        idxs = [date.index(year) for year in self.get_valid_years() if year in date]
        if not idxs:
            return date, ""
        cut = min(idxs) + 4  # a year is four characters long
        return date[:cut], date[cut:]

    def get_valid_years(self):
        """Return the accepted years as strings: the last 100 years up to and
        including the current one."""
        current_year = datetime.today().year
        # +1 so that dates in the current year are not rejected.
        return [str(i) for i in range(current_year - 100, current_year + 1)]

    def format_date(self, date):
        """Return ``date`` as "MM-YYYY", or the cleaned text if it cannot be parsed."""
        out = self.parse_date(date)
        if out:
            return out
        date = self.clean_date(date)
        out = self.parse_date(date)
        if out:
            return out
        return date

    def clean_date(self, date):
        """Drop every character except letters, digits, '-' and '/'.

        Non-iterable input is returned unchanged.
        """
        try:
            return ''.join(ch for ch in date if ch.isalnum() or ch == '-' or ch == '/')
        except TypeError:
            return date

    def parse_date(self, date):
        """Parse ``date`` with dateutil and return "MM-YYYY", or 0 on failure."""
        try:
            return parser.parse(date).strftime("%m-%Y")
        except Exception:
            # Best effort: any unparsable value is reported as 0 so that the
            # caller can retry with a cleaned string.
            return 0

    def isvalidyear(self, date):
        """Return True when ``date`` contains one of the accepted years."""
        text = str(date)
        return any(year in text for year in self.get_valid_years())

    def get_ner_in_line(self, line, entity_type):
        """Return the words of all ``entity_type`` entities found in ``line``.

        "DATE" is looked up with the dates pipeline, anything else with the
        general NER pipeline.
        """
        ner = self.ner_dates if entity_type == "DATE" else self.ner
        return [i['word'] for i in ner(line) if i['entity_group'] == entity_type]

    def get_job_company(self, idx, idx1, resume_segment):
        """Return the most company-like ORG entity near a job-title line.

        :param idx: index of the job-title line
        :param idx1: index of the neighbouring context line (may be out of range)
        :param resume_segment: the work-experience lines
        :return: the best candidate, or the context line when there is none
        """
        job_title = resume_segment[idx]
        in_range = 0 <= idx1 <= len(resume_segment) - 1
        context = resume_segment[idx1] if in_range else ""

        candidate_companies = self.get_ner_in_line(job_title, "ORG")
        if context:
            candidate_companies = candidate_companies + self.get_ner_in_line(context, "ORG")

        classes = ["organization", "company", "institution", "not organization", "not company", "not institution"]
        positive = {"organization", "company", "institution"}
        best_company, best_score = None, -1.0
        for comp in candidate_companies:
            res = self.zero_shot_classifier(comp, classes)
            # The classifier returns labels sorted by score, so the positive
            # classes must be selected by label, not by position.
            score = max(
                (s for label, s in zip(res["labels"], res["scores"]) if label in positive),
                default=0.0,
            )
            if score > best_score:
                best_company, best_score = comp, score
        if best_company is not None:
            return best_company
        return context
ResumeReader.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import os
3
+ import logging
4
+ import pdfplumber
5
+
6
class ResumeReader:
    """Reads a resume file (.pdf or .txt) into a list of cleaned text lines."""

    def _clean_lines(self, text):
        """Split ``text`` into lines, collapse whitespace runs and drop blank lines."""
        text = text.replace("\r", "\n").replace("\t", " ")
        return [re.sub(r'\s+', ' ', line.strip()) for line in text.splitlines() if line.strip()]

    def convert_docx_to_txt(self, docx_file, docx_parser):
        """
        Placeholder for Microsoft Word support.

        The actual docx extraction is disabled, so no text is ever read from
        ``docx_file`` and the result is always empty.

        :param docx_file: path of the .doc/.docx file (currently unused)
        :param docx_parser: name of the parser to use (currently unused)
        :return: ``(resume_lines, raw_text)``, which is always ``([], "")``
        :rtype: tuple
        """
        logging.warning('docx/doc extraction is not implemented; returning no text')
        text = ""
        return self._clean_lines(text), text

    def convert_pdf_to_txt(self, pdf_file):
        """
        Convert a machine-readable PDF to cleaned text lines.

        :param pdf_file: path to the .pdf file which should be converted
        :type pdf_file: str
        :return: ``(resume_lines, raw_text)``
        :rtype: tuple
        """
        # The context manager closes the file even if extraction fails.
        with pdfplumber.open(pdf_file) as pdf:
            # extract_text() may yield None for a page without text.
            raw_text = "".join((page.extract_text() or "") + "\n" for page in pdf.pages)

        full_string = re.sub(r'\n+', '\n', raw_text)
        full_string = full_string.replace("\r", "\n")
        full_string = full_string.replace("\t", " ")

        # Remove awkward bullet characters and unresolved glyph ids
        full_string = re.sub(r"\uf0b7", " ", full_string)
        full_string = re.sub(r"\(cid:\d{0,3}\)", " ", full_string)
        full_string = re.sub(r'• ', " ", full_string)

        # Remove empty strings and redundant whitespace
        resume_lines = [
            re.sub(r'\s+', ' ', line.strip())
            for line in full_string.splitlines(True) if line.strip()
        ]
        return resume_lines, raw_text

    def read_file(self, file, docx_parser="tika"):
        """
        Read a resume and return its cleaned, non-empty lines.

        :param file: path of the resume file
        :param docx_parser: forwarded to ``convert_docx_to_txt``
        :return: list of lines, or ``None`` for an unsupported extension
        """
        print("Reading the Resume...")
        file = os.path.join(file)
        # Compare case-insensitively so that e.g. "CV.PDF" is accepted.
        lowered = file.lower()
        if lowered.endswith(('docx', 'doc')):
            resume_lines, _ = self.convert_docx_to_txt(file, docx_parser)
        elif lowered.endswith('pdf'):
            resume_lines, _ = self.convert_pdf_to_txt(file)
        elif lowered.endswith('txt'):
            with open(file, 'r', encoding='utf-8') as f:
                # Clean the lines like the other formats do: raw lines keep
                # their newlines and blank lines, which break later stages.
                resume_lines = self._clean_lines(f.read())
        else:
            resume_lines = None

        return resume_lines
ResumeSegmenter.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from Models import Models
2
+
3
class ResumeSegmenter:
    """Splits resume lines into sections by matching known section headers."""

    def __init__(self, zero_shot_classifier):
        """:param zero_shot_classifier: callable ``(text, labels)`` returning a
        dict with parallel ``"labels"`` and ``"scores"`` lists"""
        self.zero_shot_classifier = zero_shot_classifier

    # Header prefixes (lower case) that open each kind of section.
    objective = (
        'career goal',
        'objective',
        'career objective',
        'employment objective',
        'professional objective',
        'summary',
        'summary of qualifications',
        'digital'
    )

    work_and_employment = (
        'employment history',
        'employment data',
        'career summary',
        'work history',
        'work experience',
        'experience',
        'professional experience',
        'professional background',
        'professional employment',
        'additional experience',
        'career related experience',
        "professional employment history",
        'related experience',
        'programming experience',
        'freelance',
        'freelance experience',
        'army experience',
        'military experience',
        'military background',
    )

    education_and_training = (
        'academic background',
        'academic experience',
        'programs',
        'courses',
        'related courses',
        'education',
        'educational background',
        'educational qualifications',
        'educational training',
        'education and training',
        'training',
        'academic training',
        'professional training',
        'course project experience',
        'related course projects',
        'internship experience',
        'internships',
        'apprenticeships',
        'college activities',
        'certifications',
        'special training',
    )

    skills_header = (
        'credentials',
        'qualifications',
        'areas of experience',
        'areas of expertise',
        'areas of knowledge',
        'skills',
        "other skills",
        "other abilities",
        'career related skills',
        'professional skills',
        'specialized skills',
        'technical skills',
        'computer skills',
        'personal skills',
        'computer knowledge',
        'technologies',
        'technical experience',
        'proficiencies',
        'languages',
        'language competencies and skills',
        'programming languages',
        'competencies'
    )

    misc = (
        'activities and honors',
        'activities',
        'affiliations',
        'professional affiliations',
        'associations',
        'professional associations',
        'memberships',
        'professional memberships',
        'athletic involvement',
        'community involvement',
        'refere',
        'civic activities',
        # Lower case: headers are lower-cased before matching, so the former
        # 'extra-Curricular activities' could never match.
        'extra-curricular activities',
        'professional activities',
        'volunteer work',
        'volunteer experience',
        'additional information',
        'interests'
    )

    accomplishments = (
        'achievement',
        'awards and achievements',
        'licenses',
        'presentations',
        'conference presentations',
        'conventions',
        'dissertations',
        'exhibits',
        'papers',
        'publications',
        'professional publications',
        'research experience',
        'research grants',
        'project',
        'research projects',
        'personal projects',
        'current research interests',
        'thesis',
        'theses',
    )

    # (segment name, header prefixes), in matching priority order.
    _SECTIONS = (
        ('objective', objective),
        ('work_and_employment', work_and_employment),
        ('education_and_training', education_and_training),
        ('skills', skills_header),
        ('misc', misc),
        ('accomplishments', accomplishments),
    )

    def find_segment_indices(self, string_to_search, resume_segments, resume_indices):
        """Record where each section header occurs.

        Mutates its arguments: appends the header line index to
        ``resume_indices`` and stores ``{matched prefix: line index}`` under
        the section's entry in ``resume_segments``.

        :param string_to_search: list of resume lines
        """
        for i, line in enumerate(string_to_search):
            # Headers do not start with a lower-case letter; empty lines are
            # skipped too (indexing them would raise IndexError).
            if not line or line[0].islower():
                continue

            header = line.lower()
            for segment_name, keywords in self._SECTIONS:
                matches = [k for k in keywords if header.startswith(k)]
                if not matches:
                    continue
                section = resume_segments[segment_name]
                if header not in section:
                    resume_indices.append(i)
                    section[matches[0]] = i
                # Only the first matching section kind is considered.
                break

    def slice_segments(self, string_to_search, resume_segments, resume_indices):
        """Replace each recorded header index with the lines of its section.

        Everything before the first header becomes ``contact_info``; every
        section runs from its header up to the next header (or the end).

        :return: ``{section: (start, end)}``, holding the span of the last
            sub-section processed for each section
        """
        resume_segments['contact_info'] = string_to_search[:resume_indices[0]]
        sec_idxs = {}
        for section, value in resume_segments.items():
            if section == 'contact_info':
                continue

            for sub_section, start_idx in value.items():
                position = resume_indices.index(start_idx)
                if position + 1 != len(resume_indices):
                    end_idx = resume_indices[position + 1]
                else:
                    end_idx = len(string_to_search)

                sec_idxs[section] = (start_idx, end_idx)
                resume_segments[section][sub_section] = string_to_search[start_idx:end_idx]
        return sec_idxs

    def find_true_segment(self, dict_of_segments, segment_name):
        """Pick the sub-section that best fits ``segment_name``.

        :param dict_of_segments: ``{sub-section key: list of lines}``
        :param segment_name: one of the section names
        :return: the key of the best-scoring sub-section, or 0 if there is none
        """
        segment_classes = {
            'objective': ["objective", "other"],
            'work_and_employment': ["employment history", "other"],
            'education_and_training': ["education", "other"],
            'skills': ["skills", "other"],
            'accomplishments': ["accomplishments", "other"],
            'misc': ["misc", "other"],
            'contact_info': ["contact information", "other"]
        }
        classes = segment_classes[segment_name]
        target = classes[0]

        best_key, best_score = 0, None
        for seg, sequence in dict_of_segments.items():
            out = self.zero_shot_classifier(' '.join(sequence), classes)
            # The result is sorted by score, so look the target label up by
            # name; scores[0] could just as well belong to "other".
            score = dict(zip(out["labels"], out["scores"])).get(target, 0.0)
            if best_score is None or score > best_score:
                best_key, best_score = seg, score
        return best_key

    def segment(self, string_to_search):
        """Segment the resume lines.

        :param string_to_search: list of resume lines
        :return: dict keyed by section name plus ``contact_info``; sections
            with a single header hold the lines below that header, sections
            without any header stay an empty dict, except
            ``work_and_employment``, which becomes an empty list
        """
        print("Segmenting the Resume..")
        resume_segments = {
            'objective': {},
            'work_and_employment': {},
            'education_and_training': {},
            'skills': {},
            'accomplishments': {},
            'misc': {}
        }

        resume_indices = []

        self.find_segment_indices(string_to_search, resume_segments, resume_indices)
        if resume_indices:
            self.slice_segments(string_to_search, resume_segments, resume_indices)
        else:
            resume_segments['contact_info'] = []

        for segment in resume_segments:
            if segment == "contact_info":
                continue
            subsections = resume_segments[segment]
            if len(subsections) == 1:
                only_key = next(iter(subsections))
                # [1:] drops the header line itself.
                resume_segments[segment] = subsections[only_key][1:]
                continue
            # Several competing headers are only resolved for work experience.
            if segment != "work_and_employment":
                continue
            if not subsections:
                resume_segments[segment] = []
                continue
            true_seg = self.find_true_segment(subsections, segment)
            if not true_seg:
                resume_segments[segment] = []
            else:
                resume_segments[segment] = subsections[true_seg][1:]

        return resume_segments
app.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydoc import describe
2
+ import gradio as gr
3
+ from Main import Main
4
+
5
+
6
# Build the pipeline once at import time; Main() loads all models, so every
# request handled below reuses the same instance.
main = Main()


def parse_cv(cv):
    """Parse an uploaded CV.

    `cv` is the uploaded-file object handed over by Gradio; its `.name`
    attribute is passed to the parser as the file path.
    """
    return main.parse_cv(cv.name)


# Texts and widgets for the Gradio interface.
description = """A demo for a CV parser built with HuggingFace's transformers."""
article = "Find the code on GitHub 🚀: https://github.com/asimokby/cv-parser-huggingface"
file_input = gr.inputs.File(file_count="single", type="file", label="Upload a CV: .PDF Or .TXT", optional=False)
iface = gr.Interface(fn=parse_cv, inputs=file_input, outputs="json", allow_flagging="never",
                     allow_screenshot=False, title="CV Parser", theme="dark", description=description, article=article)

# Start serving the interface as soon as the script runs.
iface.launch()
requirements.txt ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiohttp==3.8.1
2
+ aiosignal==1.2.0
3
+ analytics-python==1.4.0
4
+ anyio==3.5.0
5
+ asgiref==3.5.0
6
+ async-timeout==4.0.2
7
+ attrs==21.4.0
8
+ backoff==1.10.0
9
+ bcrypt==3.2.0
10
+ bpemb==0.3.3
11
+ certifi==2021.10.8
12
+ cffi==1.15.0
13
+ chardet==4.0.0
14
+ charset-normalizer==2.0.11
15
+ click==8.0.3
16
+ colorama==0.4.4
17
+ coloredlogs==15.0.1
18
+ conllu==4.4.1
19
+ cryptography==36.0.1
20
+ cycler==0.11.0
21
+ Cython==0.29.23
22
+ Deprecated==1.2.13
23
+ doc2text==0.2.4
24
+ fastapi==0.73.0
25
+ ffmpy==0.3.0
26
+ filelock==3.4.2
27
+ flair==0.10
28
+ flatbuffers==2.0
29
+ fonttools==4.29.1
30
+ frozenlist==1.3.0
31
+ ftfy==6.0.3
32
+ future==0.18.2
33
+ gdown==3.12.2
34
+ gensim==4.1.2
35
+ gradio==2.7.5.2
36
+ h11==0.13.0
37
+ huggingface-hub==0.4.0
38
+ humanfriendly==10.0
39
+ idna==3.3
40
+ importlib-metadata==3.10.1
41
+ Janome==0.4.1
42
+ Jinja2==3.0.3
43
+ joblib==1.1.0
44
+ kiwisolver==1.3.2
45
+ konoha==4.6.5
46
+ langdetect==1.0.9
47
+ markdown2==2.4.2
48
+ MarkupSafe==2.0.1
49
+ matplotlib==3.5.1
50
+ mime==0.1.0
51
+ monotonic==1.6
52
+ more-itertools==8.8.0
53
+ mpld3==0.3
54
+ multidict==6.0.2
55
+ numpy==1.22.1
56
+ overrides==3.1.0
57
+ packaging==21.3
58
+ pandas==1.4.0
59
+ paramiko==2.9.2
60
+ pdfminer.six==20211012
61
+ pdfplumber==0.6.0
62
+ Pillow==9.0.1
63
+ protobuf==3.19.4
64
+ psutil==5.9.0
65
+ py-cpuinfo==8.0.0
66
+ py3nvml==0.2.7
67
+ pycparser==2.21
68
+ pycryptodome==3.14.1
69
+ pydantic==1.9.0
70
+ pydub==0.25.1
71
+ PyNaCl==1.5.0
72
+ pyparsing==3.0.7
73
+ PyPDF2==1.26.0
74
+ pyreadline3==3.4.1
75
+ PySocks==1.7.1
76
+ pytesseract==0.3.8
77
+ python-dateutil==2.8.2
78
+ python-multipart==0.0.5
79
+ pytz==2021.3
80
+ PyYAML==6.0
81
+ regex==2022.1.18
82
+ requests==2.27.1
83
+ sacremoses==0.0.47
84
+ scikit-learn==1.0.2
85
+ scipy==1.7.3
86
+ segtok==1.5.11
87
+ sentencepiece==0.1.95
88
+ six==1.16.0
89
+ smart-open==5.2.1
90
+ sniffio==1.2.0
91
+ sqlitedict==1.7.0
92
+ starlette==0.17.1
93
+ tabulate==0.8.9
94
+ threadpoolctl==3.1.0
95
+ tokenizers==0.10.3
96
+ torch==1.10.2
97
+ tqdm==4.62.3
98
+ transformers==4.15.0
99
+ typing_extensions==4.0.1
100
+ urllib3==1.26.8
101
+ uvicorn==0.17.4
102
+ Wand==0.6.7
103
+ wcwidth==0.2.5
104
+ Wikipedia-API==0.5.4
105
+ wincertstore==0.2
106
+ wrapt==1.13.3
107
+ xmltodict==0.12.0
108
+ yarl==1.7.2
109
+ zipp==3.7.0