bishalbose294 committed on
Commit
775f69c
·
1 Parent(s): a7e7c48

initial commit

.gitignore ADDED
@@ -0,0 +1,5 @@
1
+ *env/
2
+ *pycache*/
3
+ *test_data/
4
+ uploads/*
5
+ test_file.py
Dockerfile ADDED
@@ -0,0 +1,22 @@
1
+ FROM python:3.10.11
2
+
3
+ WORKDIR /code
4
+
5
+ COPY ./requirements.txt /code/requirements.txt
6
+
7
+ COPY ./packages.txt /code/packages.txt
8
+
9
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
10
+
11
+ COPY . .
12
+
13
+ ENV TRANSFORMERS_CACHE=/code/hf_model
14
+ ENV HF_HOME=/code/hf_model
15
+ ENV HF_DATASETS_CACHE=/code/hf_model
16
+ ENV XDG_CACHE_HOME=/code/hf_model
17
+
18
+ RUN chmod -R 777 .
19
+
20
+ EXPOSE 7860
21
+
22
+ CMD ["python", "app.py", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,11 +1 @@
1
- ---
2
- title: TalentScoutAI
3
- emoji: 🌖
4
- colorFrom: indigo
5
- colorTo: gray
6
- sdk: docker
7
- pinned: false
8
- license: unknown
9
- ---
10
-
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ # Talent-Scout-AI
app.py ADDED
@@ -0,0 +1,103 @@
1
+ from flask import Flask, redirect, url_for, render_template, request, jsonify
2
+ from flask_cors import CORS
3
+ import simplejson as json
4
+ import os, time, traceback
5
+ import shutil
6
+ from src.mains.candidate_job_match import MatchJobCandidate
7
+ from src.mains.resume_analyzer import ResumeAnalyzer
8
+ from gevent.pywsgi import WSGIServer
9
+
10
+ app = Flask(__name__)
11
+ CORS(app=app)
12
+
13
+ cwd = os.getcwd()
14
+ app.config["ALLOWED_EXTENSIONS"] = [".pdf"]
15
+ app.config["MAX_CONTENT_LENGTH"] = 25 * 1024 * 1024 # 25 MB
16
+ app.config["UPLOAD_FOLDER"] = os.path.join(cwd, "uploads")
17
+
18
+ methods = ['GET','POST']
19
+
20
+ def home():
21
+ return render_template('index.html')
22
+
23
+ app.add_url_rule('/', 'home', home, methods=methods)
24
+
25
+ def calculate_scores():
26
+ try:
27
+ timestr = time.strftime("%Y%m%d_%H%M%S")
28
+ jds_folder = os.path.join(app.config["UPLOAD_FOLDER"],timestr,"jds")
29
+ os.makedirs(jds_folder)
30
+ res_foler = os.path.join(app.config["UPLOAD_FOLDER"],timestr,"resumes")
31
+ os.makedirs(res_foler)
32
+
33
+
34
+ jdfiles = request.files.getlist("jdfiles")
35
+ for file in jdfiles:
36
+ filePath = os.path.join(jds_folder, file.filename)
37
+ file.save(filePath)
38
+
39
+ resumefiles = request.files.getlist("resfiles")
40
+ for file in resumefiles:
41
+ filePath = os.path.join(res_foler, file.filename)
42
+ file.save(filePath)
43
+
44
+ match = MatchJobCandidate()
45
+ pointers = match.generatePointers(jds_folder, res_foler)
46
+ keywords = match.extractJDResumeKeywords(jds_folder, res_foler)
47
+
48
+ final_dict = dict()
49
+
50
+ for jd, resumePointers in pointers.items():
51
+ temp_dict = dict()
52
+ for resume, points in resumePointers.items():
53
+ temp_dict[resume] = {
54
+ 'points' : points,
55
+ 'keywords' : keywords[jd][resume],
56
+ }
57
+ final_dict[jd] = temp_dict
58
+
59
+ return json.dumps(final_dict)
60
+
61
+ except Exception as ex:
62
+ print("Exception: ",ex.with_traceback)
63
+ print(traceback.format_exc())
64
+ return jsonify({"error": str(ex)})
65
+ finally:
66
+ shutil.rmtree(os.path.join(app.config["UPLOAD_FOLDER"], timestr), ignore_errors=True)
67
+
68
+ app.add_url_rule("/calculate_scores", 'calculate_scores', calculate_scores, methods=methods)
69
+
70
+ def summarize_resume():
71
+ try:
72
+ timestr = time.strftime("%Y%m%d_%H%M%S")
73
+
74
+ res_foler = os.path.join(app.config["UPLOAD_FOLDER"],timestr,"resumes")
75
+ os.makedirs(res_foler)
76
+
77
+ resumefiles = request.files.getlist("resfiles")
78
+ for file in resumefiles:
79
+ filePath = os.path.join(res_foler, file.filename)
80
+ file.save(filePath)
81
+
82
+ resumeAnalyze = ResumeAnalyzer()
83
+ response = resumeAnalyze.resumeBatchSummarizer(res_foler)
84
+
85
+ return json.dumps(response)
86
+
87
+ except Exception as ex:
88
+ print("Exception: ",ex.with_traceback)
89
+ print(traceback.format_exc())
90
+ return jsonify({"error": str(ex)})
91
+ finally:
92
+ shutil.rmtree(os.path.join(app.config["UPLOAD_FOLDER"], timestr), ignore_errors=True)
93
+ pass
94
+
95
+ app.add_url_rule("/summarize_resume", 'summarize_resume', summarize_resume, methods=methods)
96
+
97
+ if __name__ == '__main__':
98
+ host = '0.0.0.0'
99
+ port = 7860
100
+ print("#"*50,"--Application Serving Now--","#"*50)
101
+ # app.run(host=host,port=port)
102
+ app_serve = WSGIServer((host,port),app)
103
+ app_serve.serve_forever()
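
For quick manual testing of the two routes above, a minimal client sketch (assumes the app is already serving on port 7860, that the requests package is installed, and that the two PDF paths exist; all three are illustrative assumptions, not part of this commit):

    import requests  # assumed installed; not listed in requirements.txt

    BASE_URL = "http://localhost:7860"  # port exposed by the Dockerfile

    # Field names must match request.files.getlist("jdfiles") / ("resfiles") in app.py.
    with open("sample_jd.pdf", "rb") as jd, open("sample_resume.pdf", "rb") as res:
        files = [
            ("jdfiles", ("sample_jd.pdf", jd, "application/pdf")),
            ("resfiles", ("sample_resume.pdf", res, "application/pdf")),
        ]
        response = requests.post(f"{BASE_URL}/calculate_scores", files=files)

    # Expected shape: {jd_name: {resume_name: {"points": <0-100>, "keywords": {...}}}}
    print(response.json())
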
requirements.txt ADDED
@@ -0,0 +1,10 @@
1
+ pymupdf
2
+ Flask==2.3.2
3
+ Flask_Cors==4.0.0
4
+ nltk==3.8.1
5
+ protobuf==3.19.3
6
+ semantic_text_splitter==0.13.1
7
+ sentence_transformers==2.2.2
8
+ simplejson==3.19.1
9
+ transformers
10
+ gevent
11
+ qdrant-client[fastembed]
src/configs/abbr.json ADDED
@@ -0,0 +1,122 @@
1
+ {
2
+ "ain't": "is not",
3
+ "aren't": "are not",
4
+ "can't": "cannot",
5
+ "'cause": "because",
6
+ "could've": "could have",
7
+ "couldn't": "could not",
8
+ "didn't": "did not",
9
+ "doesn't": "does not",
10
+ "don't": "do not",
11
+ "hadn't": "had not",
12
+ "hasn't": "has not",
13
+ "haven't": "have not",
14
+ "he'd": "he would",
15
+ "he'll": "he will",
16
+ "he's": "he is",
17
+ "how'd": "how did",
18
+ "how'd'y": "how do you",
19
+ "how'll": "how will",
20
+ "how's": "how is",
21
+ "I'd": "I would",
22
+ "I'd've": "I would have",
23
+ "I'll": "I will",
24
+ "I'll've": "I will have",
25
+ "I'm": "I am",
26
+ "I've": "I have",
27
+ "i'd": "i would",
28
+ "i'd've": "i would have",
29
+ "i'll": "i will",
30
+ "i'll've": "i will have",
31
+ "i'm": "i am",
32
+ "i've": "i have",
33
+ "isn't": "is not",
34
+ "it'd": "it would",
35
+ "it'd've": "it would have",
36
+ "it'll": "it will",
37
+ "it'll've": "it will have",
38
+ "it's": "it is",
39
+ "let's": "let us",
40
+ "ma'am": "madam",
41
+ "mayn't": "may not",
42
+ "might've": "might have",
43
+ "mightn't": "might not",
44
+ "mightn't've": "might not have",
45
+ "must've": "must have",
46
+ "mustn't": "must not",
47
+ "mustn't've": "must not have",
48
+ "needn't": "need not",
49
+ "needn't've": "need not have",
50
+ "o'clock": "of the clock",
51
+ "oughtn't": "ought not",
52
+ "oughtn't've": "ought not have",
53
+ "shan't": "shall not",
54
+ "sha'n't": "shall not",
55
+ "shan't've": "shall not have",
56
+ "she'd": "she would",
57
+ "she'd've": "she would have",
58
+ "she'll": "she will",
59
+ "she'll've": "she will have",
60
+ "she's": "she is",
61
+ "should've": "should have",
62
+ "shouldn't": "should not",
63
+ "shouldn't've": "should not have",
64
+ "so've": "so have",
65
+ "so's": "so as",
66
+ "this's": "this is",
67
+ "that'd": "that would",
68
+ "that'd've": "that would have",
69
+ "that's": "that is",
70
+ "there'd": "there would",
71
+ "there'd've": "there would have",
72
+ "there's": "there is",
73
+ "here's": "here is",
74
+ "they'd": "they would",
75
+ "they'd've": "they would have",
76
+ "they'll": "they will",
77
+ "they'll've": "they will have",
78
+ "they're": "they are",
79
+ "they've": "they have",
80
+ "to've": "to have",
81
+ "wasn't": "was not",
82
+ "we'd": "we would",
83
+ "we'd've": "we would have",
84
+ "we'll": "we will",
85
+ "we'll've": "we will have",
86
+ "we're": "we are",
87
+ "we've": "we have",
88
+ "weren't": "were not",
89
+ "what'll": "what will",
90
+ "what'll've": "what will have",
91
+ "what're": "what are",
92
+ "what's": "what is",
93
+ "what've": "what have",
94
+ "when's": "when is",
95
+ "when've": "when have",
96
+ "where'd": "where did",
97
+ "where's": "where is",
98
+ "where've": "where have",
99
+ "who'll": "who will",
100
+ "who'll've": "who will have",
101
+ "who's": "who is",
102
+ "who've": "who have",
103
+ "why's": "why is",
104
+ "why've": "why have",
105
+ "will've": "will have",
106
+ "won't": "will not",
107
+ "won't've": "will not have",
108
+ "would've": "would have",
109
+ "wouldn't": "would not",
110
+ "wouldn't've": "would not have",
111
+ "y'all": "you all",
112
+ "y'all'd": "you all would",
113
+ "y'all'd've": "you all would have",
114
+ "y'all're": "you all are",
115
+ "y'all've": "you all have",
116
+ "you'd": "you would",
117
+ "you'd've": "you would have",
118
+ "you'll": "you will",
119
+ "you'll've": "you will have",
120
+ "you're": "you are",
121
+ "you've": "you have"
122
+ }
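
This map is consumed by TextCleaner (src/text/text_cleaning.py), which expands contractions token by token before further cleaning; a minimal sketch of that lookup, run from the repository root:

    import json

    with open("src/configs/abbr.json", "r") as f:
        abbr = json.load(f)

    text = "I've shipped features and won't stop iterating"
    # Whitespace-split lookup, mirroring TextCleaner.__change_abbr.
    expanded = " ".join(abbr.get(token, token) for token in text.split(" "))
    print(expanded)  # I have shipped features and will not stop iterating
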
src/configs/config.cfg ADDED
@@ -0,0 +1,20 @@
1
+ [EMBEDDINGS]
2
+ SENTENCE_TRANSFORMER=nomic-ai/nomic-embed-text-v1.5
3
+ KEYWORD_EXTRACTOR=ml6team/keyphrase-extraction-distilbert-inspec
4
+ SCORING_EMBED=sentence-transformers/all-MiniLM-L6-v2
5
+
6
+ [CHUNKING]
7
+ CHUNK_SIZE=1000
8
+ CHUNK_OVERLAP=100
9
+
10
+ [ANALYZER]
11
+ TOP_KEYWORDS=20
12
+ MAX_KEYWORDS_SIZE=3
13
+ KEYWORD_MATCH_THRESHOLD=0.75
14
+ RESUME_SUMMARIZER=facebook/bart-large-cnn
15
+ RESUME_MAXLENGTH=150
16
+ RESUME_MINLENGTH=50
17
+
18
+ [CANDIDATE]
19
+ RESUME_MATCH_POINT_THRESHOLD=2
20
+ SECTION_MATCH_POINT_THRESHOLD=0.4
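
Every module reads its own section of this file with configparser and casts the string values at the call site; a small sketch of that pattern (paths assume the repository root as the working directory):

    import configparser

    config = configparser.ConfigParser()
    config.read("src/configs/config.cfg")

    chunk_size = int(config["CHUNKING"]["CHUNK_SIZE"])                        # 1000
    chunk_overlap = int(config["CHUNKING"]["CHUNK_OVERLAP"])                  # 100
    keyword_threshold = float(config["ANALYZER"]["KEYWORD_MATCH_THRESHOLD"])  # 0.75
    summarizer_model = config["ANALYZER"]["RESUME_SUMMARIZER"]                # facebook/bart-large-cnn
    print(chunk_size, chunk_overlap, keyword_threshold, summarizer_model)
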
src/configs/stopwords.txt ADDED
@@ -0,0 +1,758 @@
1
+ a
2
+ able
3
+ about
4
+ above
5
+ abst
6
+ accordance
7
+ according
8
+ accordingly
9
+ across
10
+ act
11
+ actually
12
+ added
13
+ adj
14
+ affected
15
+ affecting
16
+ affects
17
+ after
18
+ afterwards
19
+ again
20
+ against
21
+ ah
22
+ ain't
23
+ all
24
+ allow
25
+ allows
26
+ almost
27
+ alone
28
+ along
29
+ already
30
+ also
31
+ although
32
+ always
33
+ am
34
+ among
35
+ amongst
36
+ an
37
+ and
38
+ announce
39
+ another
40
+ any
41
+ anybody
42
+ anyhow
43
+ anymore
44
+ anyone
45
+ anything
46
+ anyway
47
+ anyways
48
+ anywhere
49
+ apart
50
+ apparently
51
+ appear
52
+ appreciate
53
+ appropriate
54
+ approximately
55
+ are
56
+ aren
57
+ arent
58
+ aren't
59
+ arise
60
+ around
61
+ as
62
+ a's
63
+ aside
64
+ ask
65
+ asking
66
+ associated
67
+ at
68
+ auth
69
+ available
70
+ away
71
+ awfully
72
+ b
73
+ back
74
+ be
75
+ became
76
+ because
77
+ become
78
+ becomes
79
+ becoming
80
+ been
81
+ before
82
+ beforehand
83
+ begin
84
+ beginning
85
+ beginnings
86
+ begins
87
+ behind
88
+ being
89
+ believe
90
+ below
91
+ beside
92
+ besides
93
+ best
94
+ better
95
+ between
96
+ beyond
97
+ biol
98
+ both
99
+ brief
100
+ briefly
101
+ but
102
+ by
103
+ c
104
+ ca
105
+ came
106
+ can
107
+ cannot
108
+ cant
109
+ can't
110
+ cause
111
+ causes
112
+ certain
113
+ certainly
114
+ changes
115
+ clearly
116
+ c'mon
117
+ co
118
+ com
119
+ come
120
+ comes
121
+ concerning
122
+ consequently
123
+ consider
124
+ considering
125
+ contain
126
+ containing
127
+ contains
128
+ corresponding
129
+ could
130
+ couldnt
131
+ couldn't
132
+ course
133
+ c's
134
+ currently
135
+ d
136
+ date
137
+ definitely
138
+ described
139
+ despite
140
+ did
141
+ didn't
142
+ different
143
+ do
144
+ does
145
+ doesn't
146
+ doing
147
+ done
148
+ don't
149
+ down
150
+ downwards
151
+ due
152
+ during
153
+ e
154
+ each
155
+ ed
156
+ edu
157
+ effect
158
+ eg
159
+ eight
160
+ eighty
161
+ either
162
+ else
163
+ elsewhere
164
+ end
165
+ ending
166
+ enough
167
+ entirely
168
+ especially
169
+ et
170
+ et-al
171
+ etc
172
+ even
173
+ ever
174
+ every
175
+ everybody
176
+ everyone
177
+ everything
178
+ everywhere
179
+ ex
180
+ exactly
181
+ example
182
+ except
183
+ f
184
+ far
185
+ few
186
+ ff
187
+ fifth
188
+ first
189
+ five
190
+ fix
191
+ followed
192
+ following
193
+ follows
194
+ for
195
+ former
196
+ formerly
197
+ forth
198
+ found
199
+ four
200
+ from
201
+ further
202
+ furthermore
203
+ g
204
+ gave
205
+ get
206
+ gets
207
+ getting
208
+ give
209
+ given
210
+ gives
211
+ giving
212
+ go
213
+ goes
214
+ going
215
+ gone
216
+ got
217
+ gotten
218
+ greetings
219
+ h
220
+ had
221
+ hadn't
222
+ happens
223
+ hardly
224
+ has
225
+ hasn't
226
+ have
227
+ haven't
228
+ having
229
+ he
230
+ hed
231
+ he'd
232
+ he'll
233
+ hello
234
+ help
235
+ hence
236
+ her
237
+ here
238
+ hereafter
239
+ hereby
240
+ herein
241
+ heres
242
+ here's
243
+ hereupon
244
+ hers
245
+ herself
246
+ hes
247
+ he's
248
+ hi
249
+ hid
250
+ him
251
+ himself
252
+ his
253
+ hither
254
+ home
255
+ hopefully
256
+ how
257
+ howbeit
258
+ however
259
+ how's
260
+ hundred
261
+ i
262
+ id
263
+ i'd
264
+ ie
265
+ if
266
+ ignored
267
+ i'll
268
+ im
269
+ i'm
270
+ immediate
271
+ immediately
272
+ importance
273
+ important
274
+ in
275
+ inasmuch
276
+ inc
277
+ indeed
278
+ index
279
+ indicate
280
+ indicated
281
+ indicates
282
+ information
283
+ inner
284
+ insofar
285
+ instead
286
+ into
287
+ invention
288
+ inward
289
+ is
290
+ isn't
291
+ it
292
+ itd
293
+ it'd
294
+ it'll
295
+ its
296
+ it's
297
+ itself
298
+ i've
299
+ j
300
+ just
301
+ k
302
+ keep
303
+ keeps
304
+ kept
305
+ kg
306
+ km
307
+ know
308
+ known
309
+ knows
310
+ l
311
+ largely
312
+ last
313
+ lately
314
+ later
315
+ latter
316
+ latterly
317
+ least
318
+ less
319
+ lest
320
+ let
321
+ lets
322
+ let's
323
+ like
324
+ liked
325
+ likely
326
+ line
327
+ little
328
+ 'll
329
+ look
330
+ looking
331
+ looks
332
+ ltd
333
+ m
334
+ made
335
+ mainly
336
+ make
337
+ makes
338
+ many
339
+ may
340
+ maybe
341
+ me
342
+ mean
343
+ means
344
+ meantime
345
+ meanwhile
346
+ merely
347
+ mg
348
+ might
349
+ million
350
+ miss
351
+ ml
352
+ more
353
+ moreover
354
+ most
355
+ mostly
356
+ mr
357
+ mrs
358
+ much
359
+ mug
360
+ must
361
+ mustn't
362
+ my
363
+ myself
364
+ n
365
+ na
366
+ name
367
+ namely
368
+ nay
369
+ nd
370
+ near
371
+ nearly
372
+ necessarily
373
+ necessary
374
+ need
375
+ needs
376
+ neither
377
+ never
378
+ nevertheless
379
+ new
380
+ next
381
+ nine
382
+ ninety
383
+ no
384
+ nobody
385
+ non
386
+ none
387
+ nonetheless
388
+ noone
389
+ nor
390
+ normally
391
+ nos
392
+ not
393
+ noted
394
+ nothing
395
+ novel
396
+ now
397
+ nowhere
398
+ o
399
+ obtain
400
+ obtained
401
+ obviously
402
+ of
403
+ off
404
+ often
405
+ oh
406
+ ok
407
+ okay
408
+ old
409
+ omitted
410
+ on
411
+ once
412
+ one
413
+ ones
414
+ only
415
+ onto
416
+ or
417
+ ord
418
+ other
419
+ others
420
+ otherwise
421
+ ought
422
+ our
423
+ ours
424
+ ourselves
425
+ out
426
+ outside
427
+ over
428
+ overall
429
+ owing
430
+ own
431
+ p
432
+ page
433
+ pages
434
+ part
435
+ particular
436
+ particularly
437
+ past
438
+ per
439
+ perhaps
440
+ placed
441
+ please
442
+ plus
443
+ poorly
444
+ possible
445
+ possibly
446
+ potentially
447
+ pp
448
+ predominantly
449
+ present
450
+ presumably
451
+ previously
452
+ primarily
453
+ probably
454
+ promptly
455
+ proud
456
+ provides
457
+ put
458
+ q
459
+ que
460
+ quickly
461
+ quite
462
+ qv
463
+ r
464
+ ran
465
+ rather
466
+ rd
467
+ re
468
+ readily
469
+ really
470
+ reasonably
471
+ recent
472
+ recently
473
+ ref
474
+ refs
475
+ regarding
476
+ regardless
477
+ regards
478
+ related
479
+ relatively
480
+ research
481
+ respectively
482
+ resulted
483
+ resulting
484
+ results
485
+ right
486
+ run
487
+ s
488
+ said
489
+ same
490
+ saw
491
+ say
492
+ saying
493
+ says
494
+ sec
495
+ second
496
+ secondly
497
+ section
498
+ see
499
+ seeing
500
+ seem
501
+ seemed
502
+ seeming
503
+ seems
504
+ seen
505
+ self
506
+ selves
507
+ sensible
508
+ sent
509
+ serious
510
+ seriously
511
+ seven
512
+ several
513
+ shall
514
+ shan't
515
+ she
516
+ shed
517
+ she'd
518
+ she'll
519
+ shes
520
+ she's
521
+ should
522
+ shouldn't
523
+ show
524
+ showed
525
+ shown
526
+ showns
527
+ shows
528
+ significant
529
+ significantly
530
+ similar
531
+ similarly
532
+ since
533
+ six
534
+ slightly
535
+ so
536
+ some
537
+ somebody
538
+ somehow
539
+ someone
540
+ somethan
541
+ something
542
+ sometime
543
+ sometimes
544
+ somewhat
545
+ somewhere
546
+ soon
547
+ sorry
548
+ specifically
549
+ specified
550
+ specify
551
+ specifying
552
+ still
553
+ stop
554
+ strongly
555
+ sub
556
+ substantially
557
+ successfully
558
+ such
559
+ sufficiently
560
+ suggest
561
+ sup
562
+ sure
563
+ t
564
+ take
565
+ taken
566
+ taking
567
+ tell
568
+ tends
569
+ th
570
+ than
571
+ thank
572
+ thanks
573
+ thanx
574
+ that
575
+ that'll
576
+ thats
577
+ that's
578
+ that've
579
+ the
580
+ their
581
+ theirs
582
+ them
583
+ themselves
584
+ then
585
+ thence
586
+ there
587
+ thereafter
588
+ thereby
589
+ thered
590
+ therefore
591
+ therein
592
+ there'll
593
+ thereof
594
+ therere
595
+ theres
596
+ there's
597
+ thereto
598
+ thereupon
599
+ there've
600
+ these
601
+ they
602
+ theyd
603
+ they'd
604
+ they'll
605
+ theyre
606
+ they're
607
+ they've
608
+ think
609
+ third
610
+ this
611
+ thorough
612
+ thoroughly
613
+ those
614
+ thou
615
+ though
616
+ thoughh
617
+ thousand
618
+ three
619
+ throug
620
+ through
621
+ throughout
622
+ thru
623
+ thus
624
+ til
625
+ tip
626
+ to
627
+ together
628
+ too
629
+ took
630
+ toward
631
+ towards
632
+ tried
633
+ tries
634
+ truly
635
+ try
636
+ trying
637
+ ts
638
+ t's
639
+ twice
640
+ two
641
+ u
642
+ un
643
+ under
644
+ unfortunately
645
+ unless
646
+ unlike
647
+ unlikely
648
+ until
649
+ unto
650
+ up
651
+ upon
652
+ ups
653
+ us
654
+ use
655
+ used
656
+ useful
657
+ usefully
658
+ usefulness
659
+ uses
660
+ using
661
+ usually
662
+ v
663
+ value
664
+ various
665
+ 've
666
+ very
667
+ via
668
+ viz
669
+ vol
670
+ vols
671
+ vs
672
+ w
673
+ want
674
+ wants
675
+ was
676
+ wasnt
677
+ wasn't
678
+ way
679
+ we
680
+ wed
681
+ we'd
682
+ welcome
683
+ well
684
+ we'll
685
+ went
686
+ were
687
+ we're
688
+ werent
689
+ weren't
690
+ we've
691
+ what
692
+ whatever
693
+ what'll
694
+ whats
695
+ what's
696
+ when
697
+ whence
698
+ whenever
699
+ when's
700
+ where
701
+ whereafter
702
+ whereas
703
+ whereby
704
+ wherein
705
+ wheres
706
+ where's
707
+ whereupon
708
+ wherever
709
+ whether
710
+ which
711
+ while
712
+ whim
713
+ whither
714
+ who
715
+ whod
716
+ whoever
717
+ whole
718
+ who'll
719
+ whom
720
+ whomever
721
+ whos
722
+ who's
723
+ whose
724
+ why
725
+ why's
726
+ widely
727
+ will
728
+ willing
729
+ wish
730
+ with
731
+ within
732
+ without
733
+ wonder
734
+ wont
735
+ won't
736
+ words
737
+ world
738
+ would
739
+ wouldnt
740
+ wouldn't
741
+ www
742
+ x
743
+ y
744
+ yes
745
+ yet
746
+ you
747
+ youd
748
+ you'd
749
+ you'll
750
+ your
751
+ youre
752
+ you're
753
+ yours
754
+ yourself
755
+ yourselves
756
+ you've
757
+ z
758
+ zero
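
The list is read line by line in CommonUtils and applied word by word in TextCleaner.remove_stopwords; a minimal sketch of that filtering, run from the repository root:

    with open("src/configs/stopwords.txt", "r") as f:
        stopwords = set(f.read().splitlines())

    sentence = "experienced in building and deploying machine learning systems"
    filtered = " ".join(word for word in sentence.split() if word not in stopwords)
    print(filtered)  # experienced building deploying machine learning systems
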
src/mains/candidate_job_match.py ADDED
@@ -0,0 +1,137 @@
1
+ import os
2
+ from src.text.chunking import Chunk
3
+ from src.utils.compare_metrics import CompareMetrics
4
+ from src.mains.resume_analyzer import ResumeAnalyzer
5
+ from src.text.embeddings import SentEmbeddings
6
+ from src.utils.commonutils import CommonUtils
7
+ from src.text.text_cleaning import TextCleaner
8
+ import configparser
9
+
10
+
11
+ config = configparser.ConfigParser()
12
+ config.read("src/configs/config.cfg")
13
+ candidate_config = config["CANDIDATE"]
14
+
15
+ pointsThreshold = int(candidate_config["RESUME_MATCH_POINT_THRESHOLD"])
16
+ sectionMatchThreshold = float(candidate_config["SECTION_MATCH_POINT_THRESHOLD"])
17
+
18
+ class MatchJobCandidate:
19
+
20
+ def __init__(self) -> None:
21
+ self.compareMetrics = CompareMetrics()
22
+ self.analyzer = ResumeAnalyzer()
23
+ self.chunk = Chunk()
24
+ self.embedding = SentEmbeddings()
25
+ self.utility = CommonUtils()
26
+ self.cleaner = TextCleaner()
27
+ pass
28
+
29
+ def __match(self, jdFile, resumeFile):
30
+
31
+ metric = 0
32
+ jdChunkList = self.chunk.chunk(jdFile)
33
+ resumeChunkList = self.chunk.chunk(resumeFile)
34
+
35
+ jdchunkEmbeddings = self.embedding.computeEmbeddingList(jdChunkList)
36
+ jdresumeEmbeddings = self.embedding.computeEmbeddingList(resumeChunkList)
37
+
38
+ total_compare = len(jdchunkEmbeddings) * len(jdresumeEmbeddings)
39
+
40
+ for i in range(len(jdchunkEmbeddings)):
41
+ for j in range(len(jdresumeEmbeddings)):
42
+ metric += self.compareMetrics.cos_sim(jdchunkEmbeddings[i],jdresumeEmbeddings[j])
43
+
44
+ return round((metric*100)/total_compare,2)
45
+
46
+ pass
47
+
48
+ def __keywordsMatch(self, jdFile, resumeFile):
49
+
50
+ jdtext_list = self.chunk.chunk(jdFile)
51
+ resumeText_list = self.chunk.chunk(resumeFile)
52
+
53
+ keywordsJD=[]
54
+ for jdtext in jdtext_list:
55
+ keywordsJD.extend(self.analyzer.extractKeywords(jdtext))
56
+
57
+ keywordsJD = sorted(list(set(keywordsJD)))
58
+
59
+ keywordsRES = []
60
+ for resumeText in resumeText_list:
61
+ keywordsRES.extend(self.analyzer.extractKeywords(resumeText))
62
+
63
+ keywordsRES = sorted(list(set(keywordsRES)))
64
+ resumeKey = []
65
+ for keyword in keywordsRES:
66
+ if not self.utility.has_numbers(keyword):
67
+ resumeKey.append(keyword)
68
+
69
+ return self.analyzer.keywordsPartialMatch(keywordsJD, keywordsRES), resumeKey
70
+ pass
71
+
72
+
73
+ def generatePointers(self, jodDescFolder, resumeFolder):
74
+ jd_list = os.listdir(jodDescFolder)
75
+ resume_list = os.listdir(resumeFolder)
76
+
77
+ jd_dict = dict()
78
+
79
+ for jd in jd_list:
80
+
81
+ resume_dict = dict()
82
+
83
+ for resume in resume_list:
84
+ jdFile = os.path.join(jodDescFolder, jd)
85
+ resumeFile = os.path.join(resumeFolder, resume)
86
+ metric = self.__match(jdFile, resumeFile)
87
+ resume_dict[resume] = metric
88
+
89
+ jd_dict[jd] = {k: v for k, v in sorted(resume_dict.items(), key=lambda item: item[1], reverse=True)}
90
+
91
+ return jd_dict
92
+ pass
93
+
94
+ def extractJDResumeKeywords(self, jodDescFolder, resumeFolder):
95
+ jd_list = os.listdir(jodDescFolder)
96
+ resume_list = os.listdir(resumeFolder)
97
+
98
+ jd_dict = dict()
99
+
100
+ for jd in jd_list:
101
+
102
+ resume_dict = dict()
103
+
104
+ for resume in resume_list:
105
+ jdFile = os.path.join(jodDescFolder, jd)
106
+ resumeFile = os.path.join(resumeFolder, resume)
107
+ resume_dict[resume], resume_dict[resume]["resume_keywords"] = self.__keywordsMatch(jdFile, resumeFile)
108
+
109
+ jd_dict[jd] = resume_dict
110
+
111
+ return jd_dict
112
+ pass
113
+
114
+ def getJDResumeScore(self, jodDescFolder, resumeFolder):
115
+ jd_list = os.listdir(jodDescFolder)
116
+ resume_list = os.listdir(resumeFolder)
117
+
118
+ jd_dict = dict()
119
+ for jd in jd_list:
120
+ jdText = self.cleaner.clean_text(self.chunk.getTextFromPdf(os.path.join(jodDescFolder, jd)))
121
+ resume_dict = dict()
122
+ for resume in resume_list:
123
+ resumeText = self.cleaner.clean_text(self.chunk.getTextFromPdf(os.path.join(resumeFolder, resume)))
124
+ results = self.compareMetrics.get_score(resumeText, jdText)
125
+ resume_dict[resume] = results[0].score
126
+ jd_dict[jd] = resume_dict
127
+
128
+ return jd_dict
129
+
130
+ pass
131
+
132
+ if __name__ == "__main__":
133
+ match = MatchJobCandidate()
134
+ jodDescFolder = "D:/Study Material/HR Assist/Code/Talent-Scout-AI/test_data/JDS"
135
+ resumeFolder = "D:/Study Material/HR Assist/Code/Talent-Scout-AI/test_data/RESUMES"
136
+ print(match.generatePointers(jodDescFolder, resumeFolder))
137
+ pass
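
The score returned by __match above is the mean cosine similarity over every JD-chunk/resume-chunk pair, scaled to 0-100. A self-contained sketch of that aggregation; the model here is a small public one chosen purely for illustration (the repo's configured embedder is nomic-ai/nomic-embed-text-v1.5):

    from sentence_transformers import SentenceTransformer, util

    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")  # illustrative stand-in

    jd_chunks = ["5+ years of Python backend experience", "exposure to NLP and transformer models"]
    resume_chunks = ["built Flask microservices in Python", "fine-tuned BERT models for text classification"]

    jd_emb = model.encode(jd_chunks)
    res_emb = model.encode(resume_chunks)

    # Average all pairwise cosine similarities and scale to 0-100, as in MatchJobCandidate.__match.
    sims = util.cos_sim(jd_emb, res_emb)          # shape: (len(jd_chunks), len(resume_chunks))
    score = round(float(sims.mean()) * 100, 2)
    print(score)
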
src/mains/resume_analyzer.py ADDED
@@ -0,0 +1,91 @@
1
+ from src.text.text_cleaning import TextCleaner
2
+ from src.text.embeddings import SentEmbeddings
3
+ from src.utils.compare_metrics import CompareMetrics
4
+ import configparser, os
5
+ from src.text.keywords import KeyphraseExtractionPipeline
6
+ from src.text.chunking import Chunk
7
+ from transformers import pipeline
8
+
9
+
10
+ config = configparser.ConfigParser()
11
+ config.read("src/configs/config.cfg")
12
+ analyzer_config = config["ANALYZER"]
13
+
14
+ topKey = float(analyzer_config["TOP_KEYWORDS"])
15
+ maxGram = float(analyzer_config["MAX_KEYWORDS_SIZE"])
16
+ matchThreshold = float(analyzer_config["KEYWORD_MATCH_THRESHOLD"])
17
+ resume_summarizer = analyzer_config["RESUME_SUMMARIZER"]
18
+ maxlength = int(analyzer_config["RESUME_MAXLENGTH"])
19
+ minlength = int(analyzer_config["RESUME_MINLENGTH"])
20
+
21
+ class ResumeAnalyzer:
22
+
23
+ def __init__(self) -> None:
24
+
25
+ self.keywordExtractor = KeyphraseExtractionPipeline()
26
+ self.cleaning = TextCleaner()
27
+ self.embeddings = SentEmbeddings()
28
+ self.compare = CompareMetrics()
29
+ self.chunk = Chunk(chunksize=1000, overlap=100)
30
+ self.summarizer = pipeline("summarization", model=resume_summarizer)
31
+
32
+ pass
33
+
34
+
35
+ def extractKeywords(self, text):
36
+ keywords = self.keywordExtractor(text)
37
+ keylist = []
38
+ for kw in keywords:
39
+ keylist.append(self.cleaning.clean_text(kw))
40
+
41
+ return keylist
42
+ pass
43
+
44
+
45
+ def keywordsPartialMatch(self, jdKeywords, resumeKeywords):
46
+
47
+ jdKeywords = sorted(list(set(jdKeywords)))
48
+ resumeKeywords = sorted(list(set(resumeKeywords)))
49
+
50
+ jdKeywords_embed = self.embeddings.computeEmbeddingList(jdKeywords)
51
+ resumeKeywords_embed = self.embeddings.computeEmbeddingList(resumeKeywords)
52
+
53
+ match_jd_res_key = dict()
54
+
55
+ for i in range(len(jdKeywords)):
56
+ resKeys = []
57
+ for j in range(len(resumeKeywords)):
58
+ metric = self.compare.cos_sim(jdKeywords_embed[i], resumeKeywords_embed[j])
59
+ if metric > matchThreshold:
60
+ resKeys.append(resumeKeywords[j])
61
+
62
+ if resKeys:
63
+ match_jd_res_key[jdKeywords[i]] = resKeys
64
+
65
+ return match_jd_res_key
66
+ pass
67
+
68
+
69
+ def __summarizeBatch(self, textBatch):
70
+ return self.summarizer(textBatch, max_length=maxlength, min_length=minlength, do_sample=False)
71
+ pass
72
+
73
+ def resumeBatchSummarizer(self, resumeFolder):
74
+ resume_list = os.listdir(resumeFolder)
75
+
76
+ resumeSummarize = dict()
77
+
78
+ for resumeFile in resume_list:
79
+ file = os.path.join(resumeFolder, resumeFile)
80
+ resumeChunk_list = self.chunk.chunk(file)
81
+ response = self.__summarizeBatch(resumeChunk_list)
82
+ print(response)
83
+ summarize = ""
84
+ for summary in response:
85
+ summarize += " "+str(summary['summary_text'])
86
+ resumeSummarize[resumeFile] = summarize
87
+
88
+ return resumeSummarize
89
+ pass
90
+
91
+ pass
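
resumeBatchSummarizer above chunks each PDF, runs the configured summarizer over the chunk batch, and concatenates the summary_text fields. A trimmed-down sketch of that step on plain text (model name and length limits mirror config.cfg; the input string is invented):

    from transformers import pipeline

    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")  # RESUME_SUMMARIZER

    chunks = [
        "Senior data engineer with seven years of experience designing batch and streaming "
        "pipelines on Spark and Kafka, leading a team of four engineers and cutting warehouse "
        "costs by thirty percent while improving data freshness from daily to hourly."
    ]
    outputs = summarizer(chunks, max_length=150, min_length=50, do_sample=False)
    summary = " ".join(out["summary_text"] for out in outputs)
    print(summary)
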
src/mains/resume_metadata.py ADDED
@@ -0,0 +1,163 @@
1
+
2
+ import re, os
3
+ from pdfminer.high_level import extract_text
4
+ import spacy
5
+ from spacy.matcher import Matcher
6
+ from src.utils.commonutils import CommonUtils
7
+ from src.mains.resume_analyzer import ResumeAnalyzer
8
+
9
+ class ResumeMetaData():
10
+
11
+ def __init__(self) -> None:
12
+ self.utils = CommonUtils()
13
+ self.analyzer = ResumeAnalyzer()
14
+ pass
15
+
16
+
17
+ def extract_text_from_pdf(self, pdf_path):
18
+ return extract_text(pdf_path)
19
+
20
+
21
+ def extract_contact_number_from_resume(self, text):
22
+ contact_number = None
23
+ # Use regex pattern to find a potential contact number
24
+ pattern = r"\b(?:\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b"
25
+ tmp = re.findall(pattern,text)
26
+ r1 = '[^0-9]+'
27
+ contact_number_list = []
28
+ for con in tmp:
29
+ contact_number_list.append(re.sub(r1, "", con)[-10:])
30
+
31
+ contact_number = ", ".join(contact_number_list)
32
+
33
+ return contact_number
34
+
35
+
36
+ def extract_email_from_resume(self, text):
37
+ email = None
38
+ # Use regex pattern to find a potential email address
39
+ pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
40
+ email = ", ".join(re.findall(pattern,text))
41
+ return email
42
+
43
+
44
+ def extract_education_from_resume(self, text):
45
+ education = []
46
+
47
+ # Use regex pattern to find education information
48
+ pattern = r"(?i)(?:Bsc|\bB\.\w+|\bM\.\w+|\bPh\.D\.\w+|\bBachelor(?:'s)?|\bMaster(?:'s)?|\bPh\.D)\s(?:\w+\s)*\w+"
49
+ matches = re.findall(pattern, text)
50
+ for match in matches:
51
+ education.append(match.strip())
52
+
53
+ return education
54
+
55
+
56
+ def extract_name(self, resume_text):
57
+ nlp = spacy.load('en_core_web_lg')
58
+ matcher = Matcher(nlp.vocab)
59
+
60
+ # Define name patterns
61
+ patterns = [
62
+ [{'POS': 'PROPN'}, {'POS': 'PROPN'}], # First name and Last name
63
+ [{'POS': 'PROPN'}, {'POS': 'PROPN'}, {'POS': 'PROPN'}], # First name, Middle name, and Last name
64
+ [{'POS': 'PROPN'}, {'POS': 'PROPN'}, {'POS': 'PROPN'}, {'POS': 'PROPN'}] # First name, Middle name, Middle name, and Last name
65
+ # Add more patterns as needed
66
+ ]
67
+
68
+ for pattern in patterns:
69
+ matcher.add('NAME', patterns=[pattern])
70
+
71
+ doc = nlp(resume_text)
72
+ matches = matcher(doc)
73
+
74
+ for match_id, start, end in matches:
75
+ span = doc[start:end]
76
+ return span.text
77
+
78
+ return None
79
+
80
+ def extract_links_extended(self, text):
81
+ links = []
82
+ pattern = r'\b((?:https?://)?(?:(?:www\.)?(?:[\da-z\.-]+)\.(?:[a-z]{2,6})|(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)|(?:(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])))(?::[0-9]{1,4}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])?(?:/[\w\.-]*)*/?)\b'
83
+ links = re.findall(pattern, text)
84
+ for link in links:
85
+ if "/" not in link:
86
+ links.remove(link)
87
+ return links
88
+
89
+
90
+ def extract_keywords(self, text):
91
+ return self.analyzer.extractKeywords(text)
92
+
93
+ def extractMetaData(self, resumeFolder):
94
+
95
+ resume_list = os.listdir(resumeFolder)
96
+ resume_info = dict()
97
+
98
+ for resume in resume_list:
99
+ print(resume)
100
+ meta_data = dict()
101
+ resume_path = os.path.join(resumeFolder, resume)
102
+ text = self.extract_text_from_pdf(resume_path)
103
+
104
+
105
+ name = self.extract_name(text)
106
+ if name:
107
+ meta_data["Name"] = name
108
+ else:
109
+ meta_data["Name"] = ""
110
+
111
+
112
+ contact_number = self.extract_contact_number_from_resume(text)
113
+ if contact_number:
114
+ meta_data["Contact Number"] = contact_number
115
+ else:
116
+ meta_data["Contact Number"] = ""
117
+
118
+
119
+ email = self.extract_email_from_resume(text)
120
+ if email:
121
+ meta_data["Email"] = email
122
+ else:
123
+ print("Email not found")
124
+
125
+
126
+ extracted_education = self.extract_education_from_resume(text)
127
+ if extracted_education:
128
+ meta_data["Education"] = extracted_education
129
+ else:
130
+ meta_data["Education"] = ""
131
+
132
+
133
+ extracted_links = self.extract_links_extended(text)
134
+ if extracted_links:
135
+ meta_data["Links"] = extracted_links
136
+ else:
137
+ meta_data["Links"] = ""
138
+
139
+
140
+ extracted_keywords = self.extract_keywords(text)
141
+ if extracted_keywords:
142
+ meta_data["Skills"] = extracted_keywords
143
+ else:
144
+ meta_data["Skills"] = ""
145
+
146
+
147
+ resume_info[resume] = meta_data
148
+
149
+ return resume_info
150
+
151
+ pass
152
+
153
+
154
+ if __name__ == '__main__':
155
+
156
+ resumeFolder = "D:/Study Material/Projects/HR Assist/Code/test_data/RESUMES"
157
+
158
+ metadata = ResumeMetaData()
159
+ info = metadata.extractMetaData(resumeFolder)
160
+
161
+ print(info)
162
+
163
+ pass
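
The extractors above are plain regex passes over raw PDF text; a small illustration of the email and contact-number patterns on an invented sample:

    import re

    sample = "Jane Doe | jane.doe@example.com | +1 (555) 123-4567 | https://github.com/janedoe"

    email_pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
    phone_pattern = r"\b(?:\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b"

    print(re.findall(email_pattern, sample))  # ['jane.doe@example.com']
    print(re.findall(phone_pattern, sample))  # ['1 (555) 123-4567']
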
src/text/chunking.py ADDED
@@ -0,0 +1,40 @@
1
+ import fitz
2
+ from semantic_text_splitter import TextSplitter
3
+ import configparser
4
+
5
+ config = configparser.ConfigParser()
6
+ config.read("src/configs/config.cfg")
7
+ chunk_config = config["CHUNKING"]
8
+
9
+
10
+ class Chunk:
11
+ def __init__(self, chunksize=int(chunk_config["CHUNK_SIZE"]), overlap=int(chunk_config["CHUNK_OVERLAP"])) -> None:
12
+ self.splitter = TextSplitter(capacity=chunksize, overlap=overlap)
13
+
14
+ def chunk(self, inputFileLoc) -> list:
15
+ doc = fitz.open(inputFileLoc)
16
+
17
+ text = ""
18
+ for page in doc:
19
+ text += " "+ page.get_text()
20
+
21
+ chunks = self.splitter.chunks(text)
22
+
23
+ return chunks
24
+
25
+ def getTextFromPdf(self, inputFileLoc) -> list:
26
+ doc = fitz.open(inputFileLoc)
27
+
28
+ text = ""
29
+ for page in doc:
30
+ text += " "+ page.get_text()
31
+
32
+ return text
33
+
34
+
35
+
36
+
37
+ if __name__ == "__main__":
38
+ input_file = '../test_data/RESUMES/AnanyaDasResume.pdf'
39
+ chunker = Chunk()
40
+ print(chunker.chunk(input_file))
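
The splitter itself works on plain strings, so the PDF step can be skipped when experimenting; capacity and overlap below mirror the [CHUNKING] defaults:

    from semantic_text_splitter import TextSplitter

    splitter = TextSplitter(capacity=1000, overlap=100)  # CHUNK_SIZE / CHUNK_OVERLAP

    text = "Results-driven engineer with broad platform experience. " * 200  # stand-in for PDF text
    chunks = splitter.chunks(text)
    print(len(chunks), max(len(c) for c in chunks))  # several chunks, each at most ~1000 characters
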
src/text/embeddings.py ADDED
@@ -0,0 +1,39 @@
1
+ from sentence_transformers import SentenceTransformer
2
+ from src.text.text_cleaning import TextCleaner
3
+ import configparser
4
+
5
+ config = configparser.ConfigParser()
6
+ config.read("src/configs/config.cfg")
7
+ embed_config = config["EMBEDDINGS"]
8
+
9
+
10
+ class SentEmbeddings():
11
+
12
+ def __init__(self) -> None:
13
+ self.model = SentenceTransformer(embed_config['SENTENCE_TRANSFORMER'], trust_remote_code=True)  # device auto-selected (CUDA when available, otherwise CPU)
14
+ pass
15
+
16
+ def computeEmbedding(self, sentence):
17
+ cleaner = TextCleaner()
18
+ clean_sent = cleaner.clean_text(sentence)
19
+ return self.model.encode(clean_sent)
20
+ pass
21
+
22
+ def computeEmbeddingList(self, sentenceList):
23
+ cleaner = TextCleaner()
24
+ cleaned_sentList = []
25
+ for i in range(len(sentenceList)):
26
+ cleaned_sentList.append(cleaner.clean_text(sentenceList[i]))
27
+ return self.model.encode(cleaned_sentList)
28
+ pass
29
+
30
+ pass
31
+
32
+
33
+ if __name__ == "__main__":
34
+ embed = SentEmbeddings()
35
+ test_sent = """This isn't a panda,,,, you are wrong this is a well versed bear ..
36
+ which you'll never understand!!!!!!!!!!!!!!!!"""
37
+ embedding = embed.computeEmbedding(test_sent)
38
+ print(embedding)
39
+ pass
src/text/keywords.py ADDED
@@ -0,0 +1,23 @@
1
+ from transformers import TokenClassificationPipeline, AutoModelForTokenClassification, AutoTokenizer
2
+ from transformers.pipelines import AggregationStrategy
3
+ import numpy as np
4
+ import configparser
5
+
6
+ config = configparser.ConfigParser()
7
+ config.read("src/configs/config.cfg")
8
+ embed_config = config["EMBEDDINGS"]
9
+
10
+ class KeyphraseExtractionPipeline(TokenClassificationPipeline):
11
+
12
+ def __init__(self,):
13
+ super().__init__(
14
+ model=AutoModelForTokenClassification.from_pretrained(str(embed_config["KEYWORD_EXTRACTOR"])),
15
+ tokenizer=AutoTokenizer.from_pretrained(embed_config["KEYWORD_EXTRACTOR"])
16
+ )
17
+
18
+ def postprocess(self, all_outputs):
19
+ results = super().postprocess(
20
+ all_outputs=all_outputs,
21
+ aggregation_strategy=AggregationStrategy.FIRST,
22
+ )
23
+ return np.unique([result.get("word").strip() for result in results])
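
A minimal usage sketch for the pipeline above (run from the repository root so config.cfg resolves; the model is downloaded on first use and the printed phrases are only indicative):

    from src.text.keywords import KeyphraseExtractionPipeline

    extractor = KeyphraseExtractionPipeline()
    text = "Designed scalable data pipelines using Apache Spark and deployed models with Docker."
    print(extractor(text))  # unique keyphrases, e.g. ['Apache Spark', 'Docker', ...]
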
src/text/text_cleaning.py ADDED
@@ -0,0 +1,55 @@
1
+ import re
2
+ from nltk.stem import WordNetLemmatizer
3
+ from src.utils.commonutils import CommonUtils
4
+
5
+ class TextCleaner:
6
+
7
+ def __init__(self) -> None:
8
+ self.lemmatizer = WordNetLemmatizer()
9
+ self.comonUtils = CommonUtils()
10
+ self.stopwords = self.comonUtils.loadStropwords()
11
+ self.abbr_words = self.comonUtils.loadAbbreviations()
12
+ pass
13
+
14
+ def __remove_html_tags(self, text):
15
+ clean_text = re.sub(r'<.*?>', '', text)
16
+ return clean_text
17
+
18
+ def __remove_special_characters(self, text):
19
+ clean_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
20
+ return clean_text
21
+
22
+ def __convert_to_lowercase(self, text):
23
+ lowercased_text = text.lower()
24
+ return lowercased_text
25
+
26
+ def __change_abbr(self, text):
27
+ abbreviation = ' '.join([self.abbr_words[t] if t in self.abbr_words else t for t in text.split(" ")])
28
+ return abbreviation
29
+
30
+ def __remove_whitespace(self, text):
31
+ cleaned_text = ' '.join(text.split())
32
+ return cleaned_text
33
+
34
+ def __lemmatize_text(self, tokens):
35
+ lemmatized_tokens = ' '.join([self.lemmatizer.lemmatize(word) for word in tokens.split()])
36
+ return lemmatized_tokens
37
+
38
+ def remove_stopwords(self, tokens):
39
+ filtered_tokens = ' '.join([word for word in tokens.split() if word not in self.stopwords])
40
+ return filtered_tokens
41
+
42
+ def remove_numbers(self, text):
43
+ result = re.sub(r'[0-9]+', ' ', text)
44
+ result = self.__remove_whitespace(result)
45
+ return result
46
+
47
+ def clean_text(self, text):
48
+ sentence = self.__remove_html_tags(text)
49
+ sentence = self.__change_abbr(sentence)
50
+ sentence = self.__lemmatize_text(sentence)
51
+ sentence = self.__remove_special_characters(sentence)
52
+ sentence = self.__convert_to_lowercase(sentence)
53
+ sentence = self.__remove_whitespace(sentence)
54
+ return sentence
55
+ pass
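
clean_text chains the private helpers above: strip HTML, expand contractions, lemmatize, drop special characters, lowercase, and collapse whitespace. Run from the repository root (and with the NLTK wordnet corpus available) it behaves roughly like this:

    import nltk
    nltk.download("wordnet", quiet=True)  # WordNetLemmatizer needs this corpus

    from src.text.text_cleaning import TextCleaner

    cleaner = TextCleaner()
    raw = "<p>I've been designing APIs &  micro-services!!</p>"
    print(cleaner.clean_text(raw))  # roughly: "i have been designing apis microservices"
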
src/utils/commonutils.py ADDED
@@ -0,0 +1,58 @@
1
+ import os, json, re
2
+ from datetime import datetime
3
+ from dateutil import relativedelta
4
+
5
+ class CommonUtils:
6
+
7
+ def __init__(self) -> None:
8
+ pass
9
+
10
+ def loadStropwords(self,):
11
+ with open(os.path.join("src", "configs", "stopwords.txt"), "r") as g:
12
+ stopwords = g.read().splitlines()
13
+ return stopwords
14
+
15
+ def loadAbbreviations(self,):
16
+ with open(os.path.join("src", "configs", "abbr.json"), "r") as json_file:
17
+ data = json.load(json_file)
18
+ return data
19
+
20
+ def has_numbers(self, inputString):
21
+ return bool(re.search(r'\d', inputString))
22
+
23
+
24
+ def get_number_of_months_from_dates(self, date1, date2):
25
+ if date2.lower() == 'present':
26
+ date2 = datetime.now().strftime('%b %Y')
27
+ try:
28
+ if len(date1.split()[0]) > 3:
29
+ date1 = date1.split()
30
+ date1 = date1[0][:3] + ' ' + date1[1]
31
+ if len(date2.split()[0]) > 3:
32
+ date2 = date2.split()
33
+ date2 = date2[0][:3] + ' ' + date2[1]
34
+ except IndexError:
35
+ return 0
36
+ try:
37
+ date1 = datetime.strptime(str(date1), '%b %Y')
38
+ date2 = datetime.strptime(str(date2), '%b %Y')
39
+ months_of_experience = relativedelta.relativedelta(date2, date1)
40
+ months_of_experience = (months_of_experience.years
41
+ * 12 + months_of_experience.months)
42
+ except ValueError:
43
+ return 0
44
+ return months_of_experience
45
+
46
+ pass
47
+
48
+
49
+
50
+
51
+ if __name__ == "__main__":
52
+
53
+ cu = CommonUtils()
54
+ print(type(cu.loadAbbreviations()))
55
+ print(cu.loadAbbreviations())
56
+
57
+
58
+ pass
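
get_number_of_months_from_dates normalises month names to their three-letter form, parses them with '%b %Y', and returns the span in months ('present' maps to today); for example, run from the repository root:

    from src.utils.commonutils import CommonUtils

    utils = CommonUtils()
    print(utils.get_number_of_months_from_dates("January 2022", "Mar 2023"))  # 14
    print(utils.get_number_of_months_from_dates("Jun 2023", "present"))       # months up to the current date
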
src/utils/compare_metrics.py ADDED
@@ -0,0 +1,57 @@
1
+
2
+ from sentence_transformers import util
3
+ from src.text.embeddings import SentEmbeddings
4
+ from src.text.text_cleaning import TextCleaner
5
+ from typing import List
6
+ from qdrant_client import QdrantClient
7
+ import configparser
8
+
9
+ config = configparser.ConfigParser()
10
+ config.read("src/configs/config.cfg")
11
+ embed_config = config["EMBEDDINGS"]
12
+
13
+ class CompareMetrics:
14
+
15
+ def __init__(self) -> None:
16
+ self.sentEmbedding = SentEmbeddings()
17
+ self.textCleaner = TextCleaner()
18
+ pass
19
+
20
+ def dot_score(self, emb1, emb2):
21
+ return round(util.dot_score(emb1, emb2).numpy()[0][0].tolist(),2)
22
+
23
+ def cos_sim(self, emb1, emb2):
24
+ return round(util.cos_sim(emb1, emb2).numpy()[0][0].tolist(),2)
25
+
26
+ def calculate_similarity(self, sent1, sent2):
27
+ metrics = dict()
28
+ cleaned_sent1 = self.textCleaner.clean_text(sent1)
29
+ cleaned_sent2 = self.textCleaner.clean_text(sent2)
30
+
31
+ emb1 = self.sentEmbedding.computeEmbedding(cleaned_sent1)
32
+ emb2 = self.sentEmbedding.computeEmbedding(cleaned_sent2)
33
+ metrics['dot_score'] = self.dot_score(emb1, emb2)
34
+ metrics['cos_sim'] = self.cos_sim(emb1, emb2)
35
+
36
+ ## sending only cos_sim as both are same
37
+ return metrics['cos_sim']
38
+
39
+
40
+ def get_score(self, resume_string, job_description_string):
41
+
42
+ documents: List[str] = [resume_string]
43
+ client = QdrantClient(":memory:")
44
+ client.set_model(embed_config['SCORING_EMBED'])
45
+
46
+ client.add(
47
+ collection_name="demo_collection",
48
+ documents=documents,
49
+ )
50
+
51
+ search_result = client.query(
52
+ collection_name="demo_collection", query_text=job_description_string
53
+ )
54
+
55
+ return search_result
56
+
57
+ pass
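
The comment in calculate_similarity treats dot_score and cos_sim as interchangeable, which only holds for unit-normalised embeddings; a quick check of that assumption with toy vectors:

    import numpy as np
    from sentence_transformers import util

    a = np.array([[0.6, 0.8]], dtype=np.float32)   # unit length
    b = np.array([[1.2, 1.6]], dtype=np.float32)   # same direction, twice the length

    print(float(util.cos_sim(a, b)))    # 1.0 (direction only)
    print(float(util.dot_score(a, b)))  # 2.0 (magnitude matters)

    b_unit = b / np.linalg.norm(b)
    print(float(util.cos_sim(a, b_unit)), float(util.dot_score(a, b_unit)))  # 1.0 1.0
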
src/utils/scout.py ADDED
@@ -0,0 +1,21 @@
1
+ import google.generativeai as genai
2
+ import os, textwrap
3
+
4
+ def to_markdown(text):
5
+ text = text.replace('•', ' *')
6
+ return textwrap.indent(text, '> ', predicate=lambda _: True)
7
+
8
+ GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")  # read the key from the environment rather than hard-coding it
9
+ genai.configure(api_key=GOOGLE_API_KEY)
10
+
11
+
12
+ for m in genai.list_models():
13
+ if 'generateContent' in m.supported_generation_methods:
14
+ print(m.name)
15
+
16
+
17
+ model = genai.GenerativeModel('gemini-pro',)
18
+
19
+ response = model.generate_content("What is the meaning of life?")
20
+
21
+ print(response.text)
static/scripts.js ADDED
@@ -0,0 +1,54 @@
1
+
2
+
3
+
4
+ document.getElementById('compare-button').addEventListener('click', function() {
5
+ var jdfiles = document.getElementById('jd');
6
+ var resfiles = document.getElementById('resume');
7
+
8
+ document.getElementById('comparison-output').innerText = 'Generating Response...';
9
+
10
+ if (jdfiles.value.length < 1 || resfiles.value.length < 1) {
11
+ alert("Please select pdf to upload..");
12
+ return false;
13
+ }
14
+ else if(jdfiles.files.length > 1){
15
+ alert("Max 1 file can be uploaded in JD.");
16
+ return false;
17
+ }
18
+ else if(resfiles.files.length > 5){
19
+ alert("Max 5 files can be uploaded in Resume.");
20
+ return false;
21
+ }
22
+
23
+ const formData = new FormData();
24
+
25
+ for (var x = 0; x < jdfiles.files.length; x++) {
26
+ formData.append("jdfiles", jdfiles.files[x]);
27
+ }
28
+
29
+ for (var x = 0; x < resfiles.files.length; x++) {
30
+ formData.append("resfiles", resfiles.files[x]);
31
+ }
32
+
35
+
36
+ fetch('/summarize_resume', {
37
+ method: 'POST',
38
+ body: formData
39
+ })
40
+ .then(response => response.json())
41
+ .then(data => {
42
+ document.getElementById('comparison-output').innerText = JSON.stringify(data, null, 2);
43
+ })
44
+ .catch(error => {
45
+ console.error('Error:', error);
46
+ document.getElementById('comparison-output').innerText = 'An error occurred during comparison.';
47
+ });
48
+ });
49
+
50
+
51
+ document.getElementById('clear-button').addEventListener('click', function() {
52
+ document.getElementById('upload-form').reset();
53
+ document.getElementById('comparison-output').innerText = '';
54
+ });
static/styles.css ADDED
@@ -0,0 +1,84 @@
1
+ body {
2
+ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
3
+ background: linear-gradient(0.25turn, #3f87a6, #ebf8e1, #f69d3c);
4
+ margin: 0;
5
+ padding: 0;
6
+ display: flex;
7
+ justify-content: center;
8
+ align-items: center;
9
+ height: 100vh;
10
+ color: #fff;
11
+ }
12
+
13
+ .container {
14
+ background: #fff;
15
+ padding: 40px 50px;
16
+ box-shadow: 0 10px 30px rgba(0, 0, 0, 0.1);
17
+ border-radius: 10px;
18
+ text-align: center;
19
+ width: 90%;
20
+ max-width: 500px;
21
+ color: #333;
22
+ }
23
+
24
+ h1 {
25
+ margin-bottom: 25px;
26
+ font-size: 2em;
27
+ color: #800020;
28
+ }
29
+
30
+ .file-input {
31
+ margin-bottom: 20px;
32
+ }
33
+
34
+ label {
35
+ display: block;
36
+ margin-bottom: 10px;
37
+ font-weight: bold;
38
+ font-size: 1.1em;
39
+ color: #800020;
40
+ }
41
+
42
+ input[type="file"] {
43
+ width: 100%;
44
+ padding: 10px;
45
+ border: 2px solid #ddd;
46
+ border-radius: 5px;
47
+ font-size: 1em;
48
+ transition: border-color 0.3s ease;
49
+ }
50
+
51
+ input[type="file"]:focus {
52
+ border-color: #800020;
53
+ outline: none;
54
+ }
55
+
56
+ button {
57
+ padding: 12px 25px;
58
+ background: #800020;
59
+ color: #fff;
60
+ border: none;
61
+ border-radius: 25px;
62
+ cursor: pointer;
63
+ font-size: 1em;
64
+ transition: background 0.3s ease, transform 0.3s ease;
65
+ box-shadow: 0 5px 15px rgba(128, 0, 32, 0.2);
66
+ }
67
+
68
+ button:hover {
69
+ background: #4c0014;
70
+ transform: translateY(-2px);
71
+ }
72
+
73
+ #results {
74
+ margin-top: 35px;
75
+ }
76
+
77
+ #comparison-output {
78
+ padding: 20px;
79
+ background: #f1f1f1;
80
+ border-radius: 5px;
81
+ box-shadow: inset 0 0 10px rgba(0, 0, 0, 0.1);
82
+ text-align: left;
83
+ white-space: pre-wrap;
84
+ }
templates/index.html ADDED
@@ -0,0 +1,31 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Talent Scout AI</title>
7
+ <link rel="stylesheet" href="{{ url_for('static', filename='styles.css') }}">
8
+ </head>
9
+ <body>
10
+ <div class="container">
11
+ <h1>Talent Scout AI</h1>
12
+ <form id="upload-form">
13
+ <div class="file-input">
14
+ <label for="jd">Upload JD:</label>
15
+ <input type="file" id="jd" accept="application/pdf" required multiple>
16
+ </div>
17
+ <div class="file-input">
18
+ <label for="resume">Upload RESUME:</label>
19
+ <input type="file" id="resume" accept="application/pdf" required multiple>
20
+ </div>
21
+ <button type="button" id="compare-button">Compare</button>
22
+ <button type="button" id="clear-button">Clear All</button>
23
+ </form>
24
+ <div id="results">
25
+ <h2>Comparison Results</h2>
26
+ <div id="comparison-output" style="overflow-y: scroll; height:200px;"></div>
27
+ </div>
28
+ </div>
29
+ <script src="{{ url_for('static', filename='scripts.js') }}"></script>
30
+ </body>
31
+ </html>