bishalbose294 committed · Commit 775f69c · Parent(s): a7e7c48
initial commit
Browse files
- .gitignore +5 -0
- Dockerfile +22 -0
- README.md +1 -11
- app.py +103 -0
- requirements.txt +10 -0
- src/configs/abbr.json +122 -0
- src/configs/config.cfg +20 -0
- src/configs/stopwords.txt +758 -0
- src/mains/candidate_job_match.py +137 -0
- src/mains/resume_analyzer.py +91 -0
- src/mains/resume_metadata.py +163 -0
- src/text/chunking.py +40 -0
- src/text/embeddings.py +39 -0
- src/text/keywords.py +23 -0
- src/text/text_cleaning.py +55 -0
- src/utils/commonutils.py +58 -0
- src/utils/compare_metrics.py +57 -0
- src/utils/scout.py +21 -0
- static/scripts.js +54 -0
- static/styles.css +84 -0
- templates/index.html +31 -0
.gitignore
ADDED
@@ -0,0 +1,5 @@
+*env/
+*pycache*/
+*test_data/
+uploads/*
+test_file.py
Dockerfile
ADDED
@@ -0,0 +1,22 @@
+FROM python:3.10.11
+
+WORKDIR /code
+
+COPY ./requirements.txt /code/requirements.txt
+
+COPY ./packages.txt /code/packages.txt
+
+RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+COPY . .
+
+ENV TRANSFORMERS_CACHE=/code/hf_model
+ENV HF_HOME=/code/hf_model
+ENV HF_DATASETS_CACHE=/code/hf_model
+ENV XDG_CACHE_HOME=/code/hf_model
+
+RUN chmod -R 777 .
+
+EXPOSE 7860
+
+CMD ["python", "app.py", "--host", "0.0.0.0", "--port", "7860"]
README.md
CHANGED
@@ -1,11 +1 @@
----
-title: TalentScoutAI
-emoji: 🌖
-colorFrom: indigo
-colorTo: gray
-sdk: docker
-pinned: false
-license: unknown
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# Talent-Scout-AI
app.py
ADDED
@@ -0,0 +1,103 @@
+from flask import Flask, redirect, url_for, render_template, request, jsonify
+from flask_cors import CORS
+import simplejson as json
+import os, time, traceback
+import shutil
+from src.mains.candidate_job_match import MatchJobCandidate
+from src.mains.resume_analyzer import ResumeAnalyzer
+from gevent.pywsgi import WSGIServer
+
+app = Flask(__name__)
+CORS(app=app)
+
+cwd = os.getcwd()
+app.config["ALLOWED_EXTENSIONS"] = [".pdf"]
+app.config["MAX_CONTENT_LENGTH"] = 25 * 1024 * 1024  # 25 MB
+app.config["UPLOAD_FOLDER"] = os.path.join(cwd, "uploads")
+
+methods = ['GET', 'POST']
+
+def home():
+    return render_template('index.html')
+
+app.add_url_rule('/', 'home', home, methods=methods)
+
+def calculate_scores():
+    try:
+        timestr = time.strftime("%Y%m%d_%H%M%S")
+        jds_folder = os.path.join(app.config["UPLOAD_FOLDER"], timestr, "jds")
+        os.makedirs(jds_folder)
+        res_folder = os.path.join(app.config["UPLOAD_FOLDER"], timestr, "resumes")
+        os.makedirs(res_folder)
+
+        jdfiles = request.files.getlist("jdfiles")
+        for file in jdfiles:
+            filePath = os.path.join(jds_folder, file.filename)
+            file.save(filePath)
+
+        resumefiles = request.files.getlist("resfiles")
+        for file in resumefiles:
+            filePath = os.path.join(res_folder, file.filename)
+            file.save(filePath)
+
+        match = MatchJobCandidate()
+        pointers = match.generatePointers(jds_folder, res_folder)
+        keywords = match.extractJDResumeKeywords(jds_folder, res_folder)
+
+        final_dict = dict()
+
+        for jd, resumePointers in pointers.items():
+            temp_dict = dict()
+            for resume, points in resumePointers.items():
+                temp_dict[resume] = {
+                    'points': points,
+                    'keywords': keywords[jd][resume],
+                }
+            final_dict[jd] = temp_dict
+
+        return json.dumps(final_dict)
+
+    except Exception as ex:
+        print("Exception: ", ex)
+        print(traceback.format_exc())
+        return jsonify({"error": str(ex)})
+    finally:
+        # ignore_errors=True so cleanup never raises from inside the finally block
+        shutil.rmtree(os.path.join(app.config["UPLOAD_FOLDER"], timestr), ignore_errors=True)
+
+app.add_url_rule("/calculate_scores", 'calculate_scores', calculate_scores, methods=methods)
+
+def summarize_resume():
+    try:
+        timestr = time.strftime("%Y%m%d_%H%M%S")
+
+        res_folder = os.path.join(app.config["UPLOAD_FOLDER"], timestr, "resumes")
+        os.makedirs(res_folder)
+
+        resumefiles = request.files.getlist("resfiles")
+        for file in resumefiles:
+            filePath = os.path.join(res_folder, file.filename)
+            file.save(filePath)
+
+        resumeAnalyze = ResumeAnalyzer()
+        response = resumeAnalyze.resumeBatchSummarizer(res_folder)
+
+        return json.dumps(response)
+
+    except Exception as ex:
+        print("Exception: ", ex)
+        print(traceback.format_exc())
+        return jsonify({"error": str(ex)})
+    finally:
+        shutil.rmtree(os.path.join(app.config["UPLOAD_FOLDER"], timestr), ignore_errors=True)
+
+app.add_url_rule("/summarize_resume", 'summarize_resume', summarize_resume, methods=methods)
+
+if __name__ == '__main__':
+    host = '0.0.0.0'
+    port = 7860
+    print("#"*50, "--Application Serving Now--", "#"*50)
+    # app.run(host=host, port=port)
+    app_serve = WSGIServer((host, port), app)
+    app_serve.serve_forever()
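For reference, a minimal client sketch for these endpoints (a hypothetical example, not part of the commit: it assumes the app is reachable at http://localhost:7860, that jd.pdf and resume.pdf exist locally, and that the requests package is installed — it is not in requirements.txt):

import requests  # assumed available; not pinned in requirements.txt

# POST one JD and one resume to /calculate_scores; the field names must match
# request.files.getlist("jdfiles") / getlist("resfiles") in app.py.
with open("jd.pdf", "rb") as jd, open("resume.pdf", "rb") as res:
    resp = requests.post(
        "http://localhost:7860/calculate_scores",
        files=[("jdfiles", ("jd.pdf", jd, "application/pdf")),
               ("resfiles", ("resume.pdf", res, "application/pdf"))],
    )
print(resp.json())  # {jd: {resume: {"points": ..., "keywords": ...}}}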
requirements.txt
ADDED
@@ -0,0 +1,10 @@
+pymupdf
+Flask==2.3.2
+Flask_Cors==4.0.0
+nltk==3.8.1
+protobuf==3.19.3
+semantic_text_splitter==0.13.1
+sentence_transformers==2.2.2
+simplejson==3.19.1
+transformers
+gevent
src/configs/abbr.json
ADDED
@@ -0,0 +1,122 @@
+{
+    "ain't": "is not",
+    "aren't": "are not",
+    "can't": "cannot",
+    "'cause": "because",
+    "could've": "could have",
+    "couldn't": "could not",
+    "didn't": "did not",
+    "doesn't": "does not",
+    "don't": "do not",
+    "hadn't": "had not",
+    "hasn't": "has not",
+    "haven't": "have not",
+    "he'd": "he would",
+    "he'll": "he will",
+    "he's": "he is",
+    "how'd": "how did",
+    "how'd'y": "how do you",
+    "how'll": "how will",
+    "how's": "how is",
+    "I'd": "I would",
+    "I'd've": "I would have",
+    "I'll": "I will",
+    "I'll've": "I will have",
+    "I'm": "I am",
+    "I've": "I have",
+    "i'd": "i would",
+    "i'd've": "i would have",
+    "i'll": "i will",
+    "i'll've": "i will have",
+    "i'm": "i am",
+    "i've": "i have",
+    "isn't": "is not",
+    "it'd": "it would",
+    "it'd've": "it would have",
+    "it'll": "it will",
+    "it'll've": "it will have",
+    "it's": "it is",
+    "let's": "let us",
+    "ma'am": "madam",
+    "mayn't": "may not",
+    "might've": "might have",
+    "mightn't": "might not",
+    "mightn't've": "might not have",
+    "must've": "must have",
+    "mustn't": "must not",
+    "mustn't've": "must not have",
+    "needn't": "need not",
+    "needn't've": "need not have",
+    "o'clock": "of the clock",
+    "oughtn't": "ought not",
+    "oughtn't've": "ought not have",
+    "shan't": "shall not",
+    "sha'n't": "shall not",
+    "shan't've": "shall not have",
+    "she'd": "she would",
+    "she'd've": "she would have",
+    "she'll": "she will",
+    "she'll've": "she will have",
+    "she's": "she is",
+    "should've": "should have",
+    "shouldn't": "should not",
+    "shouldn't've": "should not have",
+    "so've": "so have",
+    "so's": "so as",
+    "this's": "this is",
+    "that'd": "that would",
+    "that'd've": "that would have",
+    "that's": "that is",
+    "there'd": "there would",
+    "there'd've": "there would have",
+    "there's": "there is",
+    "here's": "here is",
+    "they'd": "they would",
+    "they'd've": "they would have",
+    "they'll": "they will",
+    "they'll've": "they will have",
+    "they're": "they are",
+    "they've": "they have",
+    "to've": "to have",
+    "wasn't": "was not",
+    "we'd": "we would",
+    "we'd've": "we would have",
+    "we'll": "we will",
+    "we'll've": "we will have",
+    "we're": "we are",
+    "we've": "we have",
+    "weren't": "were not",
+    "what'll": "what will",
+    "what'll've": "what will have",
+    "what're": "what are",
+    "what's": "what is",
+    "what've": "what have",
+    "when's": "when is",
+    "when've": "when have",
+    "where'd": "where did",
+    "where's": "where is",
+    "where've": "where have",
+    "who'll": "who will",
+    "who'll've": "who will have",
+    "who's": "who is",
+    "who've": "who have",
+    "why's": "why is",
+    "why've": "why have",
+    "will've": "will have",
+    "won't": "will not",
+    "won't've": "will not have",
+    "would've": "would have",
+    "wouldn't": "would not",
+    "wouldn't've": "would not have",
+    "y'all": "you all",
+    "y'all'd": "you all would",
+    "y'all'd've": "you all would have",
+    "y'all're": "you all are",
+    "y'all've": "you all have",
+    "you'd": "you would",
+    "you'd've": "you would have",
+    "you'll": "you will",
+    "you'll've": "you will have",
+    "you're": "you are",
+    "you've": "you have"
+}
src/configs/config.cfg
ADDED
@@ -0,0 +1,20 @@
+[EMBEDDINGS]
+SENTENCE_TRANSFORMER=nomic-ai/nomic-embed-text-v1.5
+KEYWORD_EXTRACTOR=ml6team/keyphrase-extraction-distilbert-inspec
+SCORING_EMBED=sentence-transformers/all-MiniLM-L6-v2
+
+[CHUNKING]
+CHUNK_SIZE=1000
+CHUNK_OVERLAP=100
+
+[ANALYZER]
+TOP_KEYWORDS=20
+MAX_KEYWORDS_SIZE=3
+KEYWORD_MATCH_THRESHOLD=0.75
+RESUME_SUMMARIZER=facebook/bart-large-cnn
+RESUME_MAXLENGTH=150
+RESUME_MINLENGTH=50
+
+[CANDIDATE]
+RESUME_MATCH_POINT_THRESHOLD=2
+SECTION_MATCH_POINT_THRESHOLD=0.4
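All modules in src/ read this file the same way; a minimal sketch of that pattern with Python's configparser (run from the repo root so the relative path resolves):

import configparser

config = configparser.ConfigParser()
config.read("src/configs/config.cfg")

analyzer_config = config["ANALYZER"]
# configparser values are strings, so each module casts them explicitly
top_keywords = int(analyzer_config["TOP_KEYWORDS"])
match_threshold = float(analyzer_config["KEYWORD_MATCH_THRESHOLD"])
print(top_keywords, match_threshold)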
src/configs/stopwords.txt
ADDED
@@ -0,0 +1,758 @@
+a
+able
+about
+above
+abst
+accordance
+according
+accordingly
+across
+act
+actually
+added
+adj
+affected
+affecting
+affects
+after
+afterwards
+again
+against
+ah
+ain't
+all
+allow
+allows
+almost
+alone
+along
+already
+also
+although
+always
+am
+among
+amongst
+an
+and
+announce
+another
+any
+anybody
+anyhow
+anymore
+anyone
+anything
+anyway
+anyways
+anywhere
+apart
+apparently
+appear
+appreciate
+appropriate
+approximately
+are
+aren
+arent
+aren't
+arise
+around
+as
+a's
+aside
+ask
+asking
+associated
+at
+auth
+available
+away
+awfully
+b
+back
+be
+became
+because
+become
+becomes
+becoming
+been
+before
+beforehand
+begin
+beginning
+beginnings
+begins
+behind
+being
+believe
+below
+beside
+besides
+best
+better
+between
+beyond
+biol
+both
+brief
+briefly
+but
+by
+c
+ca
+came
+can
+cannot
+cant
+can't
+cause
+causes
+certain
+certainly
+changes
+clearly
+c'mon
+co
+com
+come
+comes
+concerning
+consequently
+consider
+considering
+contain
+containing
+contains
+corresponding
+could
+couldnt
+couldn't
+course
+c's
+currently
+d
+date
+definitely
+described
+despite
+did
+didn't
+different
+do
+does
+doesn't
+doing
+done
+don't
+down
+downwards
+due
+during
+e
+each
+ed
+edu
+effect
+eg
+eight
+eighty
+either
+else
+elsewhere
+end
+ending
+enough
+entirely
+especially
+et
+et-al
+etc
+even
+ever
+every
+everybody
+everyone
+everything
+everywhere
+ex
+exactly
+example
+except
+f
+far
+few
+ff
+fifth
+first
+five
+fix
+followed
+following
+follows
+for
+former
+formerly
+forth
+found
+four
+from
+further
+furthermore
+g
+gave
+get
+gets
+getting
+give
+given
+gives
+giving
+go
+goes
+going
+gone
+got
+gotten
+greetings
+h
+had
+hadn't
+happens
+hardly
+has
+hasn't
+have
+haven't
+having
+he
+hed
+he'd
+he'll
+hello
+help
+hence
+her
+here
+hereafter
+hereby
+herein
+heres
+here's
+hereupon
+hers
+herself
+hes
+he's
+hi
+hid
+him
+himself
+his
+hither
+home
+hopefully
+how
+howbeit
+however
+how's
+hundred
+i
+id
+i'd
+ie
+if
+ignored
+i'll
+im
+i'm
+immediate
+immediately
+importance
+important
+in
+inasmuch
+inc
+indeed
+index
+indicate
+indicated
+indicates
+information
+inner
+insofar
+instead
+into
+invention
+inward
+is
+isn't
+it
+itd
+it'd
+it'll
+its
+it's
+itself
+i've
+j
+just
+k
+keep
+keeps
+kept
+kg
+km
+know
+known
+knows
+l
+largely
+last
+lately
+later
+latter
+latterly
+least
+less
+lest
+let
+lets
+let's
+like
+liked
+likely
+line
+little
+'ll
+look
+looking
+looks
+ltd
+m
+made
+mainly
+make
+makes
+many
+may
+maybe
+me
+mean
+means
+meantime
+meanwhile
+merely
+mg
+might
+million
+miss
+ml
+more
+moreover
+most
+mostly
+mr
+mrs
+much
+mug
+must
+mustn't
+my
+myself
+n
+na
+name
+namely
+nay
+nd
+near
+nearly
+necessarily
+necessary
+need
+needs
+neither
+never
+nevertheless
+new
+next
+nine
+ninety
+no
+nobody
+non
+none
+nonetheless
+noone
+nor
+normally
+nos
+not
+noted
+nothing
+novel
+now
+nowhere
+o
+obtain
+obtained
+obviously
+of
+off
+often
+oh
+ok
+okay
+old
+omitted
+on
+once
+one
+ones
+only
+onto
+or
+ord
+other
+others
+otherwise
+ought
+our
+ours
+ourselves
+out
+outside
+over
+overall
+owing
+own
+p
+page
+pages
+part
+particular
+particularly
+past
+per
+perhaps
+placed
+please
+plus
+poorly
+possible
+possibly
+potentially
+pp
+predominantly
+present
+presumably
+previously
+primarily
+probably
+promptly
+proud
+provides
+put
+q
+que
+quickly
+quite
+qv
+r
+ran
+rather
+rd
+re
+readily
+really
+reasonably
+recent
+recently
+ref
+refs
+regarding
+regardless
+regards
+related
+relatively
+research
+respectively
+resulted
+resulting
+results
+right
+run
+s
+said
+same
+saw
+say
+saying
+says
+sec
+second
+secondly
+section
+see
+seeing
+seem
+seemed
+seeming
+seems
+seen
+self
+selves
+sensible
+sent
+serious
+seriously
+seven
+several
+shall
+shan't
+she
+shed
+she'd
+she'll
+shes
+she's
+should
+shouldn't
+show
+showed
+shown
+showns
+shows
+significant
+significantly
+similar
+similarly
+since
+six
+slightly
+so
+some
+somebody
+somehow
+someone
+somethan
+something
+sometime
+sometimes
+somewhat
+somewhere
+soon
+sorry
+specifically
+specified
+specify
+specifying
+still
+stop
+strongly
+sub
+substantially
+successfully
+such
+sufficiently
+suggest
+sup
+sure
+t
+take
+taken
+taking
+tell
+tends
+th
+than
+thank
+thanks
+thanx
+that
+that'll
+thats
+that's
+that've
+the
+their
+theirs
+them
+themselves
+then
+thence
+there
+thereafter
+thereby
+thered
+therefore
+therein
+there'll
+thereof
+therere
+theres
+there's
+thereto
+thereupon
+there've
+these
+they
+theyd
+they'd
+they'll
+theyre
+they're
+they've
+think
+third
+this
+thorough
+thoroughly
+those
+thou
+though
+thoughh
+thousand
+three
+throug
+through
+throughout
+thru
+thus
+til
+tip
+to
+together
+too
+took
+toward
+towards
+tried
+tries
+truly
+try
+trying
+ts
+t's
+twice
+two
+u
+un
+under
+unfortunately
+unless
+unlike
+unlikely
+until
+unto
+up
+upon
+ups
+us
+use
+used
+useful
+usefully
+usefulness
+uses
+using
+usually
+v
+value
+various
+'ve
+very
+via
+viz
+vol
+vols
+vs
+w
+want
+wants
+was
+wasnt
+wasn't
+way
+we
+wed
+we'd
+welcome
+well
+we'll
+went
+were
+we're
+werent
+weren't
+we've
+what
+whatever
+what'll
+whats
+what's
+when
+whence
+whenever
+when's
+where
+whereafter
+whereas
+whereby
+wherein
+wheres
+where's
+whereupon
+wherever
+whether
+which
+while
+whim
+whither
+who
+whod
+whoever
+whole
+who'll
+whom
+whomever
+whos
+who's
+whose
+why
+why's
+widely
+will
+willing
+wish
+with
+within
+without
+wonder
+wont
+won't
+words
+world
+would
+wouldnt
+wouldn't
+www
+x
+y
+yes
+yet
+you
+youd
+you'd
+you'll
+your
+youre
+you're
+yours
+yourself
+yourselves
+you've
+z
+zero
src/mains/candidate_job_match.py
ADDED
@@ -0,0 +1,137 @@
+import os
+from src.text.chunking import Chunk
+from src.utils.compare_metrics import CompareMetrics
+from src.mains.resume_analyzer import ResumeAnalyzer
+from src.text.embeddings import SentEmbeddings
+from src.utils.commonutils import CommonUtils
+from src.text.text_cleaning import TextCleaner
+import configparser
+
+
+config = configparser.ConfigParser()
+config.read("src/configs/config.cfg")
+candidate_config = config["CANDIDATE"]
+
+pointsThreshold = int(candidate_config["RESUME_MATCH_POINT_THRESHOLD"])
+sectionMatchThreshold = float(candidate_config["SECTION_MATCH_POINT_THRESHOLD"])
+
+class MatchJobCandidate:
+
+    def __init__(self) -> None:
+        self.compareMetrics = CompareMetrics()
+        self.analyzer = ResumeAnalyzer()
+        self.chunk = Chunk()
+        self.embedding = SentEmbeddings()
+        self.utility = CommonUtils()
+        self.cleaner = TextCleaner()
+        pass
+
+    def __match(self, jdFile, resumeFile):
+
+        metric = 0
+        jdChunkList = self.chunk.chunk(jdFile)
+        resumeChunkList = self.chunk.chunk(resumeFile)
+
+        jdchunkEmbeddings = self.embedding.computeEmbeddingList(jdChunkList)
+        jdresumeEmbeddings = self.embedding.computeEmbeddingList(resumeChunkList)
+
+        total_compare = len(jdchunkEmbeddings) * len(jdresumeEmbeddings)
+
+        # average pairwise cosine similarity over all JD/resume chunk pairs, as a percentage
+        for i in range(len(jdchunkEmbeddings)):
+            for j in range(len(jdresumeEmbeddings)):
+                metric += self.compareMetrics.cos_sim(jdchunkEmbeddings[i], jdresumeEmbeddings[j])
+
+        return round((metric*100)/total_compare, 2)
+
+        pass
+
+    def __keywordsMatch(self, jdFile, resumeFile):
+
+        jdtext_list = self.chunk.chunk(jdFile)
+        resumeText_list = self.chunk.chunk(resumeFile)
+
+        keywordsJD = []
+        for jdtext in jdtext_list:
+            keywordsJD.extend(self.analyzer.extractKeywords(jdtext))
+
+        keywordsJD = sorted(list(set(keywordsJD)))
+
+        keywordsRES = []
+        for resumeText in resumeText_list:
+            keywordsRES.extend(self.analyzer.extractKeywords(resumeText))
+
+        keywordsRES = sorted(list(set(keywordsRES)))
+        resumeKey = []
+        for keyword in keywordsRES:
+            if not self.utility.has_numbers(keyword):
+                resumeKey.append(keyword)
+
+        return self.analyzer.keywordsPartialMatch(keywordsJD, keywordsRES), resumeKey
+        pass
+
+
+    def generatePointers(self, jobDescFolder, resumeFolder):
+        jd_list = os.listdir(jobDescFolder)
+        resume_list = os.listdir(resumeFolder)
+
+        jd_dict = dict()
+
+        for jd in jd_list:
+
+            resume_dict = dict()
+
+            for resume in resume_list:
+                jdFile = os.path.join(jobDescFolder, jd)
+                resumeFile = os.path.join(resumeFolder, resume)
+                metric = self.__match(jdFile, resumeFile)
+                resume_dict[resume] = metric
+
+            jd_dict[jd] = {k: v for k, v in sorted(resume_dict.items(), key=lambda item: item[1], reverse=True)}
+
+        return jd_dict
+        pass
+
+    def extractJDResumeKeywords(self, jobDescFolder, resumeFolder):
+        jd_list = os.listdir(jobDescFolder)
+        resume_list = os.listdir(resumeFolder)
+
+        jd_dict = dict()
+
+        for jd in jd_list:
+
+            resume_dict = dict()
+
+            for resume in resume_list:
+                jdFile = os.path.join(jobDescFolder, jd)
+                resumeFile = os.path.join(resumeFolder, resume)
+                # tuple unpacking assigns the match dict first, then adds the resume keyword list to it
+                resume_dict[resume], resume_dict[resume]["resume_keywords"] = self.__keywordsMatch(jdFile, resumeFile)
+
+            jd_dict[jd] = resume_dict
+
+        return jd_dict
+        pass
+
+    def getJDResumeScore(self, jobDescFolder, resumeFolder):
+        jd_list = os.listdir(jobDescFolder)
+        resume_list = os.listdir(resumeFolder)
+
+        jd_dict = dict()
+        for jd in jd_list:
+            jdText = self.cleaner.clean_text(self.chunk.getTextFromPdf(os.path.join(jobDescFolder, jd)))
+            resume_dict = dict()
+            for resume in resume_list:
+                resumeText = self.cleaner.clean_text(self.chunk.getTextFromPdf(os.path.join(resumeFolder, resume)))
+                results = self.compareMetrics.get_score(resumeText, jdText)
+                resume_dict[resume] = results[0].score
+            jd_dict[jd] = resume_dict
+
+        return jd_dict
+
+        pass
+
+if __name__ == "__main__":
+    match = MatchJobCandidate()
+    jobDescFolder = "D:/Study Material/HR Assist/Code/Talent-Scout-AI/test_data/JDS"
+    resumeFolder = "D:/Study Material/HR Assist/Code/Talent-Scout-AI/test_data/RESUMES"
+    # the class defines no run(); generatePointers is the scoring entry point
+    print(match.generatePointers(jobDescFolder, resumeFolder))
+    pass
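A minimal usage sketch for this class (hypothetical folder paths; note SentEmbeddings is hard-coded to device='cuda', so a CUDA-capable GPU is assumed):

from src.mains.candidate_job_match import MatchJobCandidate

match = MatchJobCandidate()
# each folder contains PDF files; these paths are placeholders
pointers = match.generatePointers("test_data/JDS", "test_data/RESUMES")
keywords = match.extractJDResumeKeywords("test_data/JDS", "test_data/RESUMES")
print(pointers)  # {jd.pdf: {resume.pdf: score, ...}}, sorted by score descending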
src/mains/resume_analyzer.py
ADDED
@@ -0,0 +1,91 @@
+from src.text.text_cleaning import TextCleaner
+from src.text.embeddings import SentEmbeddings
+from src.utils.compare_metrics import CompareMetrics
+import configparser, os
+from src.text.keywords import KeyphraseExtractionPipeline
+from src.text.chunking import Chunk
+from transformers import pipeline
+
+
+config = configparser.ConfigParser()
+config.read("src/configs/config.cfg")
+analyzer_config = config["ANALYZER"]
+
+topKey = float(analyzer_config["TOP_KEYWORDS"])
+maxGram = float(analyzer_config["MAX_KEYWORDS_SIZE"])
+matchThreshold = float(analyzer_config["KEYWORD_MATCH_THRESHOLD"])
+resume_summarizer = analyzer_config["RESUME_SUMMARIZER"]
+maxlength = int(analyzer_config["RESUME_MAXLENGTH"])
+minlength = int(analyzer_config["RESUME_MINLENGTH"])
+
+class ResumeAnalyzer:
+
+    def __init__(self) -> None:
+
+        self.keywordExtractor = KeyphraseExtractionPipeline()
+        self.cleaning = TextCleaner()
+        self.embeddings = SentEmbeddings()
+        self.compare = CompareMetrics()
+        self.chunk = Chunk(chunksize=1000, overlap=100)
+        self.summarizer = pipeline("summarization", model=resume_summarizer)
+
+        pass
+
+
+    def extractKeywords(self, text):
+        keywords = self.keywordExtractor(text)
+        keylist = []
+        for kw in keywords:
+            keylist.append(self.cleaning.clean_text(kw))
+
+        return keylist
+        pass
+
+
+    def keywordsPartialMatch(self, jdKeywords, resumeKeywords):
+
+        jdKeywords = sorted(list(set(jdKeywords)))
+        resumeKeywords = sorted(list(set(resumeKeywords)))
+
+        jdKeywords_embed = self.embeddings.computeEmbeddingList(jdKeywords)
+        resumeKeywords_embed = self.embeddings.computeEmbeddingList(resumeKeywords)
+
+        match_jd_res_key = dict()
+
+        for i in range(len(jdKeywords)):
+            resKeys = []
+            for j in range(len(resumeKeywords)):
+                metric = self.compare.cos_sim(jdKeywords_embed[i], resumeKeywords_embed[j])
+                if metric > matchThreshold:
+                    resKeys.append(resumeKeywords[j])
+
+            if resKeys:
+                match_jd_res_key[jdKeywords[i]] = resKeys
+
+        return match_jd_res_key
+        pass
+
+
+    def __summarizeBatch(self, textBatch):
+        return self.summarizer(textBatch, max_length=maxlength, min_length=minlength, do_sample=False)
+        pass
+
+    def resumeBatchSummarizer(self, resumeFolder):
+        resume_list = os.listdir(resumeFolder)
+
+        resumeSummarize = dict()
+
+        for resumeFile in resume_list:
+            file = os.path.join(resumeFolder, resumeFile)
+            resumeChunk_list = self.chunk.chunk(file)
+            response = self.__summarizeBatch(resumeChunk_list)
+            print(response)
+            summarize = ""
+            for summary in response:
+                summarize += " "+str(summary['summary_text'])
+            resumeSummarize[resumeFile] = summarize
+
+        return resumeSummarize
+        pass
+
+    pass
src/mains/resume_metadata.py
ADDED
@@ -0,0 +1,163 @@
+
+import re, os
+from pdfminer.high_level import extract_text
+import spacy
+from spacy.matcher import Matcher
+from src.utils.commonutils import CommonUtils
+from src.mains.resume_analyzer import ResumeAnalyzer
+
+class ResumeMetaData():
+
+    def __init__(self) -> None:
+        self.utils = CommonUtils()
+        self.analyzer = ResumeAnalyzer()
+        pass
+
+
+    def extract_text_from_pdf(self, pdf_path):
+        return extract_text(pdf_path)
+
+
+    def extract_contact_number_from_resume(self, text):
+        contact_number = None
+        # Use regex pattern to find a potential contact number
+        pattern = r"\b(?:\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b"
+        tmp = re.findall(pattern, text)
+        r1 = '[^0-9]+'
+        contact_number_list = []
+        for con in tmp:
+            # strip non-digits and keep the last 10 digits
+            contact_number_list.append(re.sub(r1, "", con)[-10:])
+
+        contact_number = ", ".join(contact_number_list)
+
+        return contact_number
+
+
+    def extract_email_from_resume(self, text):
+        email = None
+        # Use regex pattern to find a potential email address
+        pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
+        email = ", ".join(re.findall(pattern, text))
+        return email
+
+
+    def extract_education_from_resume(self, text):
+        education = []
+
+        # Use regex pattern to find education information
+        pattern = r"(?i)(?:Bsc|\bB\.\w+|\bM\.\w+|\bPh\.D\.\w+|\bBachelor(?:'s)?|\bMaster(?:'s)?|\bPh\.D)\s(?:\w+\s)*\w+"
+        matches = re.findall(pattern, text)
+        for match in matches:
+            education.append(match.strip())
+
+        return education
+
+
+    def extract_name(self, resume_text):
+        nlp = spacy.load('en_core_web_lg')
+        matcher = Matcher(nlp.vocab)
+
+        # Define name patterns
+        patterns = [
+            [{'POS': 'PROPN'}, {'POS': 'PROPN'}],  # First name and Last name
+            [{'POS': 'PROPN'}, {'POS': 'PROPN'}, {'POS': 'PROPN'}],  # First name, Middle name, and Last name
+            [{'POS': 'PROPN'}, {'POS': 'PROPN'}, {'POS': 'PROPN'}, {'POS': 'PROPN'}]  # First name, Middle name, Middle name, and Last name
+            # Add more patterns as needed
+        ]
+
+        for pattern in patterns:
+            matcher.add('NAME', patterns=[pattern])
+
+        doc = nlp(resume_text)
+        matches = matcher(doc)
+
+        # return the first matched span as the candidate name
+        for match_id, start, end in matches:
+            span = doc[start:end]
+            return span.text
+
+        return None
+
+    def extract_links_extended(self, text):
+        links = []
+        pattern = r'\b((?:https?://)?(?:(?:www\.)?(?:[\da-z\.-]+)\.(?:[a-z]{2,6})|(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)|(?:(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])))(?::[0-9]{1,4}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])?(?:/[\w\.-]*)*/?)\b'
+        links = re.findall(pattern, text)
+        # keep only URLs with a path segment; filter rather than removing
+        # items from the list while iterating over it
+        links = [link for link in links if "/" in link]
+        return links
+
+
+    def extract_keywords(self, text):
+        return self.analyzer.extractKeywords(text)
+
+    def extractMetaData(self, resumeFolder):
+
+        resume_list = os.listdir(resumeFolder)
+        resume_info = dict()
+
+        for resume in resume_list:
+            print(resume)
+            meta_data = dict()
+            resume_path = os.path.join(resumeFolder, resume)
+            text = self.extract_text_from_pdf(resume_path)
+
+
+            name = self.extract_name(text)
+            if name:
+                meta_data["Name"] = name
+            else:
+                meta_data["Name"] = ""
+
+
+            contact_number = self.extract_contact_number_from_resume(text)
+            if contact_number:
+                meta_data["Contact Number"] = contact_number
+            else:
+                meta_data["Contact Number"] = ""
+
+
+            email = self.extract_email_from_resume(text)
+            if email:
+                meta_data["Email"] = email
+            else:
+                meta_data["Email"] = ""
+                print("Email not found")
+
+
+            extracted_education = self.extract_education_from_resume(text)
+            if extracted_education:
+                meta_data["Education"] = extracted_education
+            else:
+                meta_data["Education"] = ""
+
+
+            extracted_links = self.extract_links_extended(text)
+            if extracted_links:
+                meta_data["Links"] = extracted_links
+            else:
+                meta_data["Links"] = ""
+
+
+            extracted_keywords = self.extract_keywords(text)
+            if extracted_keywords:
+                meta_data["Skills"] = extracted_keywords
+            else:
+                meta_data["Skills"] = ""
+
+
+            resume_info[resume] = meta_data
+
+        return resume_info
+
+    pass
+
+
+if __name__ == '__main__':
+
+    resumeFolder = "D:/Study Material/Projects/HR Assist/Code/test_data/RESUMES"
+
+    metadata = ResumeMetaData()
+    info = metadata.extractMetaData(resumeFolder)
+
+    print(info)
+
+    pass
src/text/chunking.py
ADDED
@@ -0,0 +1,40 @@
+import fitz
+from semantic_text_splitter import TextSplitter
+import configparser
+
+config = configparser.ConfigParser()
+config.read("src/configs/config.cfg")
+chunk_config = config["CHUNKING"]
+
+
+class Chunk:
+    def __init__(self, chunksize=int(chunk_config["CHUNK_SIZE"]), overlap=int(chunk_config["CHUNK_OVERLAP"])) -> None:
+        self.splitter = TextSplitter(capacity=chunksize, overlap=overlap)
+
+    def chunk(self, inputFileLoc) -> list:
+        doc = fitz.open(inputFileLoc)
+
+        text = ""
+        for page in doc:
+            text += " " + page.get_text()
+
+        chunks = self.splitter.chunks(text)
+
+        return chunks
+
+    def getTextFromPdf(self, inputFileLoc) -> str:
+        doc = fitz.open(inputFileLoc)
+
+        text = ""
+        for page in doc:
+            text += " " + page.get_text()
+
+        return text
+
+
+if __name__ == "__main__":
+    input_file = '../test_data/RESUMES/AnanyaDasResume.pdf'
+    chunker = Chunk()
+    print(chunker.chunk(input_file))
src/text/embeddings.py
ADDED
@@ -0,0 +1,39 @@
+from sentence_transformers import SentenceTransformer
+from src.text.text_cleaning import TextCleaner
+import configparser
+
+config = configparser.ConfigParser()
+config.read("src/configs/config.cfg")
+embed_config = config["EMBEDDINGS"]
+
+
+class SentEmbeddings():
+
+    def __init__(self) -> None:
+        self.model = SentenceTransformer(embed_config['SENTENCE_TRANSFORMER'], trust_remote_code=True, device='cuda')
+        pass
+
+    def computeEmbedding(self, sentence):
+        cleaner = TextCleaner()
+        clean_sent = cleaner.clean_text(sentence)
+        return self.model.encode(clean_sent)
+        pass
+
+    def computeEmbeddingList(self, sentenceList):
+        cleaner = TextCleaner()
+        cleaned_sentList = []
+        for i in range(len(sentenceList)):
+            cleaned_sentList.append(cleaner.clean_text(sentenceList[i]))
+        return self.model.encode(cleaned_sentList)
+        pass
+
+    pass
+
+
+if __name__ == "__main__":
+    embed = SentEmbeddings()
+    test_sent = """This isn't a panda,,,, you are wrong this is a well versed bear ..
+    which you'll never understand!!!!!!!!!!!!!!!!"""
+    embedding = embed.computeEmbedding(test_sent)
+    print(embedding)
+    pass
src/text/keywords.py
ADDED
@@ -0,0 +1,23 @@
+from transformers import TokenClassificationPipeline, AutoModelForTokenClassification, AutoTokenizer
+from transformers.pipelines import AggregationStrategy
+import numpy as np
+import configparser
+
+config = configparser.ConfigParser()
+config.read("src/configs/config.cfg")
+embed_config = config["EMBEDDINGS"]
+
+class KeyphraseExtractionPipeline(TokenClassificationPipeline):
+
+    def __init__(self,):
+        super().__init__(
+            model=AutoModelForTokenClassification.from_pretrained(str(embed_config["KEYWORD_EXTRACTOR"])),
+            tokenizer=AutoTokenizer.from_pretrained(embed_config["KEYWORD_EXTRACTOR"], device_map='cuda')
+        )
+
+    def postprocess(self, all_outputs):
+        results = super().postprocess(
+            all_outputs=all_outputs,
+            aggregation_strategy=AggregationStrategy.FIRST,
+        )
+        return np.unique([result.get("word").strip() for result in results])
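A minimal sketch of running this pipeline on raw text (the ml6team model downloads on first use; the tokenizer above is loaded with device_map='cuda', so a GPU is assumed):

from src.text.keywords import KeyphraseExtractionPipeline

extractor = KeyphraseExtractionPipeline()
phrases = extractor("Experienced Python developer with NLP and MLOps background.")
print(phrases)  # numpy array of unique keyphrase strings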
src/text/text_cleaning.py
ADDED
@@ -0,0 +1,55 @@
+import re
+from nltk.stem import WordNetLemmatizer
+from src.utils.commonutils import CommonUtils
+
+class TextCleaner:
+
+    def __init__(self) -> None:
+        self.lemmatizer = WordNetLemmatizer()
+        self.commonUtils = CommonUtils()
+        self.stopwords = self.commonUtils.loadStopwords()
+        self.abbr_words = self.commonUtils.loadAbbreviations()
+        pass
+
+    def __remove_html_tags(self, text):
+        clean_text = re.sub(r'<.*?>', '', text)
+        return clean_text
+
+    def __remove_special_characters(self, text):
+        clean_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
+        return clean_text
+
+    def __convert_to_lowercase(self, text):
+        lowercased_text = text.lower()
+        return lowercased_text
+
+    def __change_abbr(self, text):
+        # expand contractions using the abbr.json lookup
+        abbreviation = ' '.join([self.abbr_words[t] if t in self.abbr_words else t for t in text.split(" ")])
+        return abbreviation
+
+    def __remove_whitespace(self, text):
+        cleaned_text = ' '.join(text.split())
+        return cleaned_text
+
+    def __lemmatize_text(self, tokens):
+        lemmatized_tokens = ' '.join([self.lemmatizer.lemmatize(word) for word in tokens.split()])
+        return lemmatized_tokens
+
+    def remove_stopwords(self, tokens):
+        filtered_tokens = ' '.join([word for word in tokens.split() if word not in self.stopwords])
+        return filtered_tokens
+
+    def remove_numbers(self, text):
+        result = re.sub(r'[0-9]+', ' ', text)
+        result = self.__remove_whitespace(result)
+        return result
+
+    def clean_text(self, text):
+        sentence = self.__remove_html_tags(text)
+        sentence = self.__change_abbr(sentence)
+        sentence = self.__lemmatize_text(sentence)
+        sentence = self.__remove_special_characters(sentence)
+        sentence = self.__convert_to_lowercase(sentence)
+        sentence = self.__remove_whitespace(sentence)
+        return sentence
+    pass
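A minimal sketch of what clean_text does to a sentence (assumes NLTK's WordNet data is installed and the script runs from the repo root so src/configs/ resolves):

from src.text.text_cleaning import TextCleaner

cleaner = TextCleaner()
# HTML stripped, "it's" expanded via abbr.json, words lemmatized,
# punctuation dropped, lowercased, whitespace collapsed:
print(cleaner.clean_text("it's a <b>well versed</b> Engineer!!!"))
# -> "it is a well versed engineer"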
src/utils/commonutils.py
ADDED
@@ -0,0 +1,58 @@
+import os, json, re
+from datetime import datetime
+from dateutil import relativedelta
+
+class CommonUtils:
+
+    def __init__(self) -> None:
+        pass
+
+    def loadStopwords(self,):
+        with open(os.path.join("src", "configs", "stopwords.txt"), "r") as g:
+            stopwords = g.read().splitlines()
+        return stopwords
+
+    def loadAbbreviations(self,):
+        with open(os.path.join("src", "configs", "abbr.json"), "r") as json_file:
+            data = json.load(json_file)
+        return data
+
+    def has_numbers(self, inputString):
+        return bool(re.search(r'\d', inputString))
+
+
+    def get_number_of_months_from_dates(self, date1, date2):
+        # dates are "Mon YYYY" strings; "present" means now
+        if date2.lower() == 'present':
+            date2 = datetime.now().strftime('%b %Y')
+        try:
+            if len(date1.split()[0]) > 3:
+                date1 = date1.split()
+                date1 = date1[0][:3] + ' ' + date1[1]
+            if len(date2.split()[0]) > 3:
+                date2 = date2.split()
+                date2 = date2[0][:3] + ' ' + date2[1]
+        except IndexError:
+            return 0
+        try:
+            date1 = datetime.strptime(str(date1), '%b %Y')
+            date2 = datetime.strptime(str(date2), '%b %Y')
+            months_of_experience = relativedelta.relativedelta(date2, date1)
+            months_of_experience = (months_of_experience.years
+                                    * 12 + months_of_experience.months)
+        except ValueError:
+            return 0
+        return months_of_experience
+
+    pass
+
+
+
+if __name__ == "__main__":
+
+    cu = CommonUtils()
+    print(type(cu.loadAbbreviations()))
+    print(cu.loadAbbreviations())
+
+
+    pass
src/utils/compare_metrics.py
ADDED
@@ -0,0 +1,57 @@
+
+from sentence_transformers import util
+from src.text.embeddings import SentEmbeddings
+from src.text.text_cleaning import TextCleaner
+from typing import List
+from qdrant_client import QdrantClient
+import configparser
+
+config = configparser.ConfigParser()
+config.read("src/configs/config.cfg")
+embed_config = config["EMBEDDINGS"]
+
+class CompareMetrics:
+
+    def __init__(self) -> None:
+        self.sentEmbedding = SentEmbeddings()
+        self.textCleaner = TextCleaner()
+        pass
+
+    def dot_score(self, emb1, emb2):
+        return round(util.dot_score(emb1, emb2).numpy()[0][0].tolist(), 2)
+
+    def cos_sim(self, emb1, emb2):
+        return round(util.cos_sim(emb1, emb2).numpy()[0][0].tolist(), 2)
+
+    def calculate_similarity(self, sent1, sent2):
+        metrics = dict()
+        cleaned_sent1 = self.textCleaner.clean_text(sent1)
+        cleaned_sent2 = self.textCleaner.clean_text(sent2)
+
+        emb1 = self.sentEmbedding.computeEmbedding(cleaned_sent1)
+        emb2 = self.sentEmbedding.computeEmbedding(cleaned_sent2)
+        metrics['dot_score'] = self.dot_score(emb1, emb2)
+        metrics['cos_sim'] = self.cos_sim(emb1, emb2)
+
+        ## sending only cos_sim as both are the same
+        return metrics['cos_sim']
+
+
+    def get_score(self, resume_string, job_description_string):
+
+        documents: List[str] = [resume_string]
+        client = QdrantClient(":memory:")
+        client.set_model(embed_config['SCORING_EMBED'])
+
+        client.add(
+            collection_name="demo_collection",
+            documents=documents,
+        )
+
+        search_result = client.query(
+            collection_name="demo_collection", query_text=job_description_string
+        )
+
+        return search_result
+
+    pass
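A minimal sketch of the two scoring paths in this class (GPU assumed via SentEmbeddings; note that qdrant_client is imported here but is not listed in requirements.txt):

from src.utils.compare_metrics import CompareMetrics

metrics = CompareMetrics()
# embedding-based cosine similarity, rounded to 2 decimals
print(metrics.calculate_similarity("senior python engineer", "python developer"))
# Qdrant in-memory scoring of a resume string against a JD string
print(metrics.get_score("resume text ...", "job description ...")[0].score)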
src/utils/scout.py
ADDED
@@ -0,0 +1,21 @@
+import google.generativeai as genai
+import textwrap
+import os
+
+def to_markdown(text):
+    text = text.replace('•', ' *')
+    return textwrap.indent(text, '> ', predicate=lambda _: True)
+
+# read the API key from the environment; never hard-code secrets in source control
+GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")  # userdata.get('GOOGLE_API_KEY')
+genai.configure(api_key=GOOGLE_API_KEY)
+
+
+for m in genai.list_models():
+    if 'generateContent' in m.supported_generation_methods:
+        print(m.name)
+
+
+model = genai.GenerativeModel('gemini-pro',)
+
+response = model.generate_content("What is the meaning of life?")
+
+print(response.text)
static/scripts.js
ADDED
@@ -0,0 +1,54 @@
+
+
+
+document.getElementById('compare-button').addEventListener('click', function() {
+    var jdfiles = document.getElementById('jd');
+    var resfiles = document.getElementById('resume');
+
+    document.getElementById('comparison-output').innerText = 'Generating Response..';
+
+    if (jdfiles.value.length < 1 || resfiles.value.length < 1) {
+        alert("Please select pdf to upload..");
+        return false;
+    }
+    else if (jdfiles.files.length > 1) {
+        alert("Max 1 file can be uploaded in JD.");
+        return false;
+    }
+    else if (resfiles.files.length > 5) {
+        alert("Max 5 files can be uploaded in Resume.");
+        return false;
+    }
+
+    const formData = new FormData();
+
+    for (var x = 0; x < jdfiles.files.length; x++) {
+        formData.append("jdfiles", jdfiles.files[x]);
+    }
+
+    for (var x = 0; x < resfiles.files.length; x++) {
+        formData.append("resfiles", resfiles.files[x]);
+    }
+
+    // relative URL so the request targets whatever host/port serves the app
+    fetch('/summarize_resume', {
+        method: 'POST',
+        body: formData
+    })
+    .then(response => response.json())
+    .then(data => {
+        document.getElementById('comparison-output').innerText = JSON.stringify(data, null, 2);
+    })
+    .catch(error => {
+        console.error('Error:', error);
+        document.getElementById('comparison-output').innerText = 'An error occurred during comparison.';
+    });
+});
+
+
+document.getElementById('clear-button').addEventListener('click', function() {
+    document.getElementById('upload-form').reset();
+    document.getElementById('comparison-output').innerText = '';
+});
static/styles.css
ADDED
@@ -0,0 +1,84 @@
+body {
+    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
+    background: linear-gradient(0.25turn, #3f87a6, #ebf8e1, #f69d3c);
+    margin: 0;
+    padding: 0;
+    display: flex;
+    justify-content: center;
+    align-items: center;
+    height: 100vh;
+    color: #fff;
+}
+
+.container {
+    background: #fff;
+    padding: 40px 50px;
+    box-shadow: 0 10px 30px rgba(0, 0, 0, 0.1);
+    border-radius: 10px;
+    text-align: center;
+    width: 90%;
+    max-width: 500px;
+    color: #333;
+}
+
+h1 {
+    margin-bottom: 25px;
+    font-size: 2em;
+    color: #800020;
+}
+
+.file-input {
+    margin-bottom: 20px;
+}
+
+label {
+    display: block;
+    margin-bottom: 10px;
+    font-weight: bold;
+    font-size: 1.1em;
+    color: #800020;
+}
+
+input[type="file"] {
+    width: 100%;
+    padding: 10px;
+    border: 2px solid #ddd;
+    border-radius: 5px;
+    font-size: 1em;
+    transition: border-color 0.3s ease;
+}
+
+input[type="file"]:focus {
+    border-color: #800020;
+    outline: none;
+}
+
+button {
+    padding: 12px 25px;
+    background: #800020;
+    color: #fff;
+    border: none;
+    border-radius: 25px;
+    cursor: pointer;
+    font-size: 1em;
+    transition: background 0.3s ease, transform 0.3s ease;
+    box-shadow: 0 5px 15px rgba(128, 0, 32, 0.2);
+}
+
+button:hover {
+    background: #4c0014;
+    transform: translateY(-2px);
+}
+
+#results {
+    margin-top: 35px;
+}
+
+#comparison-output {
+    padding: 20px;
+    background: #f1f1f1;
+    border-radius: 5px;
+    box-shadow: inset 0 0 10px rgba(0, 0, 0, 0.1);
+    text-align: left;
+    white-space: pre-wrap;
+}
templates/index.html
ADDED
@@ -0,0 +1,31 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Talent Scout AI</title>
+    <link rel="stylesheet" href="{{ url_for('static', filename='styles.css') }}">
+</head>
+<body>
+    <div class="container">
+        <h1>Talent Scout AI</h1>
+        <form id="upload-form">
+            <div class="file-input">
+                <label for="jd">Upload JD:</label>
+                <input type="file" id="jd" accept="application/pdf" required multiple>
+            </div>
+            <div class="file-input">
+                <label for="resume">Upload RESUME:</label>
+                <input type="file" id="resume" accept="application/pdf" required multiple>
+            </div>
+            <button type="button" id="compare-button">Compare</button>
+            <button type="button" id="clear-button">Clear All</button>
+        </form>
+        <div id="results">
+            <h2>Comparison Results</h2>
+            <div id="comparison-output" style="overflow-y: scroll; height:200px;"></div>
+        </div>
+    </div>
+    <script src="{{ url_for('static', filename='scripts.js') }}"></script>
+</body>
+</html>