bishalbose294 committed on
Commit
775f69c
·
1 Parent(s): a7e7c48

initial commit

.gitignore ADDED
@@ -0,0 +1,5 @@
1
+ *env/
2
+ *pycache*/
3
+ *test_data/
4
+ uploads/*
5
+ test_file.py
Dockerfile ADDED
@@ -0,0 +1,22 @@
1
+ FROM python:3.10.11
2
+
3
+ WORKDIR /code
4
+
5
+ COPY ./requirements.txt /code/requirements.txt
6
+
7
+ COPY ./packages.txt /code/packages.txt
8
+
9
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
10
+
11
+ COPY . .
12
+
13
+ ENV TRANSFORMERS_CACHE=/code/hf_model
14
+ ENV HF_HOME=/code/hf_model
15
+ ENV HF_DATASETS_CACHE=/code/hf_model
16
+ ENV XDG_CACHE_HOME=/code/hf_model
17
+
18
+ RUN chmod -R 777 .
19
+
20
+ EXPOSE 7860
21
+
22
+ CMD ["python", "app.py", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,11 +1 @@
1
- ---
2
- title: TalentScoutAI
3
- emoji: 🌖
4
- colorFrom: indigo
5
- colorTo: gray
6
- sdk: docker
7
- pinned: false
8
- license: unknown
9
- ---
10
-
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ # Talent-Scout-AI
app.py ADDED
@@ -0,0 +1,103 @@
1
+ from flask import Flask, redirect, url_for, render_template, request, jsonify
2
+ from flask_cors import CORS
3
+ import simplejson as json
4
+ import os, time, traceback
5
+ import shutil
6
+ from src.mains.candidate_job_match import MatchJobCandidate
7
+ from src.mains.resume_analyzer import ResumeAnalyzer
8
+ from gevent.pywsgi import WSGIServer
9
+
10
+ app = Flask(__name__)
11
+ CORS(app=app)
12
+
13
+ cwd = os.getcwd()
14
+ app.config["ALLOWED_EXTENSIONS"] = [".pdf"]
15
+ app.config["MAX_CONTENT_LENGTH"] = 25 * 1024 * 1024 # 25 MB
16
+ app.config["UPLOAD_FOLDER"] = os.path.join(cwd, "uploads")
17
+
18
+ methods = ['GET','POST']
19
+
20
+ def home():
21
+ return render_template('index.html')
22
+
23
+ app.add_url_rule('/', 'home', home, methods=methods)
24
+
25
+ def calculate_scores():
26
+ try:
27
+ timestr = time.strftime("%Y%m%d_%H%M%S")
28
+ jds_folder = os.path.join(app.config["UPLOAD_FOLDER"],timestr,"jds")
29
+ os.makedirs(jds_folder)
30
+ res_foler = os.path.join(app.config["UPLOAD_FOLDER"],timestr,"resumes")
31
+ os.makedirs(res_foler)
32
+
33
+
34
+ jdfiles = request.files.getlist("jdfiles")
35
+ for file in jdfiles:
36
+ filePath = os.path.join(jds_folder, file.filename)
37
+ file.save(filePath)
38
+
39
+ resumefiles = request.files.getlist("resfiles")
40
+ for file in resumefiles:
41
+ filePath = os.path.join(res_foler, file.filename)
42
+ file.save(filePath)
43
+
44
+ match = MatchJobCandidate()
45
+ pointers = match.generatePointers(jds_folder, res_foler)
46
+ keywords = match.extractJDResumeKeywords(jds_folder, res_foler)
47
+
48
+ final_dict = dict()
49
+
50
+ for jd, resumePointers in pointers.items():
51
+ temp_dict = dict()
52
+ for resume, points in resumePointers.items():
53
+ temp_dict[resume] = {
54
+ 'points' : points,
55
+ 'keywords' : keywords[jd][resume],
56
+ }
57
+ final_dict[jd] = temp_dict
58
+
59
+ return json.dumps(final_dict)
60
+
61
+ except Exception as ex:
62
+ print("Exception: ",ex.with_traceback)
63
+ print(traceback.format_exc())
64
+ return jsonify({"error": str(ex)})
65
+ finally:
66
+ shutil.rmtree(os.path.join(app.config["UPLOAD_FOLDER"], timestr), ignore_errors=True)
67
+
68
+ app.add_url_rule("/calculate_scores", 'calculate_scores', calculate_scores, methods=methods)
69
+
70
+ def summarize_resume():
71
+ try:
72
+ timestr = time.strftime("%Y%m%d_%H%M%S")
73
+
74
+ res_foler = os.path.join(app.config["UPLOAD_FOLDER"],timestr,"resumes")
75
+ os.makedirs(res_foler)
76
+
77
+ resumefiles = request.files.getlist("resfiles")
78
+ for file in resumefiles:
79
+ filePath = os.path.join(res_foler, file.filename)
80
+ file.save(filePath)
81
+
82
+ resumeAnalyze = ResumeAnalyzer()
83
+ response = resumeAnalyze.resumeBatchSummarizer(res_foler)
84
+
85
+ return json.dumps(response)
86
+
87
+ except Exception as ex:
88
+ print("Exception: ",ex.with_traceback)
89
+ print(traceback.format_exc())
90
+ return jsonify({"error": str(ex)})
91
+ finally:
92
+ shutil.rmtree(os.path.join(app.config["UPLOAD_FOLDER"], timestr), ignore_errors=True)
93
+ pass
94
+
95
+ app.add_url_rule("/summarize_resume", 'summarize_resume', summarize_resume, methods=methods)
96
+
97
+ if __name__ == '__main__':
98
+ host = '0.0.0.0'
99
+ port = 7860
100
+ print("#"*50,"--Application Serving Now--","#"*50)
101
+ # app.run(host=host,port=port)
102
+ app_serve = WSGIServer((host,port),app)
103
+ app_serve.serve_forever()
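
For quick manual testing of the two routes above, a minimal client sketch (assumes the app is already serving on port 7860, that the requests package is installed, and that the two PDF paths exist; all three are illustrative assumptions, not part of this commit):

    import requests  # assumed installed; not listed in requirements.txt

    BASE_URL = "http://localhost:7860"  # port exposed by the Dockerfile

    # Field names must match request.files.getlist("jdfiles") / ("resfiles") in app.py.
    with open("sample_jd.pdf", "rb") as jd, open("sample_resume.pdf", "rb") as res:
        files = [
            ("jdfiles", ("sample_jd.pdf", jd, "application/pdf")),
            ("resfiles", ("sample_resume.pdf", res, "application/pdf")),
        ]
        response = requests.post(f"{BASE_URL}/calculate_scores", files=files)

    # Expected shape: {jd_name: {resume_name: {"points": <0-100>, "keywords": {...}}}}
    print(response.json())
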
requirements.txt ADDED
@@ -0,0 +1,10 @@
1
+ pymupdf
2
+ Flask==2.3.2
3
+ Flask_Cors==4.0.0
4
+ nltk==3.8.1
5
+ protobuf==3.19.3
6
+ semantic_text_splitter==0.13.1
7
+ sentence_transformers==2.2.2
8
+ simplejson==3.19.1
9
+ transformers
10
+ gevent
11
+ qdrant-client[fastembed]
src/configs/abbr.json ADDED
@@ -0,0 +1,122 @@
1
+ {
2
+ "ain't": "is not",
3
+ "aren't": "are not",
4
+ "can't": "cannot",
5
+ "'cause": "because",
6
+ "could've": "could have",
7
+ "couldn't": "could not",
8
+ "didn't": "did not",
9
+ "doesn't": "does not",
10
+ "don't": "do not",
11
+ "hadn't": "had not",
12
+ "hasn't": "has not",
13
+ "haven't": "have not",
14
+ "he'd": "he would",
15
+ "he'll": "he will",
16
+ "he's": "he is",
17
+ "how'd": "how did",
18
+ "how'd'y": "how do you",
19
+ "how'll": "how will",
20
+ "how's": "how is",
21
+ "I'd": "I would",
22
+ "I'd've": "I would have",
23
+ "I'll": "I will",
24
+ "I'll've": "I will have",
25
+ "I'm": "I am",
26
+ "I've": "I have",
27
+ "i'd": "i would",
28
+ "i'd've": "i would have",
29
+ "i'll": "i will",
30
+ "i'll've": "i will have",
31
+ "i'm": "i am",
32
+ "i've": "i have",
33
+ "isn't": "is not",
34
+ "it'd": "it would",
35
+ "it'd've": "it would have",
36
+ "it'll": "it will",
37
+ "it'll've": "it will have",
38
+ "it's": "it is",
39
+ "let's": "let us",
40
+ "ma'am": "madam",
41
+ "mayn't": "may not",
42
+ "might've": "might have",
43
+ "mightn't": "might not",
44
+ "mightn't've": "might not have",
45
+ "must've": "must have",
46
+ "mustn't": "must not",
47
+ "mustn't've": "must not have",
48
+ "needn't": "need not",
49
+ "needn't've": "need not have",
50
+ "o'clock": "of the clock",
51
+ "oughtn't": "ought not",
52
+ "oughtn't've": "ought not have",
53
+ "shan't": "shall not",
54
+ "sha'n't": "shall not",
55
+ "shan't've": "shall not have",
56
+ "she'd": "she would",
57
+ "she'd've": "she would have",
58
+ "she'll": "she will",
59
+ "she'll've": "she will have",
60
+ "she's": "she is",
61
+ "should've": "should have",
62
+ "shouldn't": "should not",
63
+ "shouldn't've": "should not have",
64
+ "so've": "so have",
65
+ "so's": "so as",
66
+ "this's": "this is",
67
+ "that'd": "that would",
68
+ "that'd've": "that would have",
69
+ "that's": "that is",
70
+ "there'd": "there would",
71
+ "there'd've": "there would have",
72
+ "there's": "there is",
73
+ "here's": "here is",
74
+ "they'd": "they would",
75
+ "they'd've": "they would have",
76
+ "they'll": "they will",
77
+ "they'll've": "they will have",
78
+ "they're": "they are",
79
+ "they've": "they have",
80
+ "to've": "to have",
81
+ "wasn't": "was not",
82
+ "we'd": "we would",
83
+ "we'd've": "we would have",
84
+ "we'll": "we will",
85
+ "we'll've": "we will have",
86
+ "we're": "we are",
87
+ "we've": "we have",
88
+ "weren't": "were not",
89
+ "what'll": "what will",
90
+ "what'll've": "what will have",
91
+ "what're": "what are",
92
+ "what's": "what is",
93
+ "what've": "what have",
94
+ "when's": "when is",
95
+ "when've": "when have",
96
+ "where'd": "where did",
97
+ "where's": "where is",
98
+ "where've": "where have",
99
+ "who'll": "who will",
100
+ "who'll've": "who will have",
101
+ "who's": "who is",
102
+ "who've": "who have",
103
+ "why's": "why is",
104
+ "why've": "why have",
105
+ "will've": "will have",
106
+ "won't": "will not",
107
+ "won't've": "will not have",
108
+ "would've": "would have",
109
+ "wouldn't": "would not",
110
+ "wouldn't've": "would not have",
111
+ "y'all": "you all",
112
+ "y'all'd": "you all would",
113
+ "y'all'd've": "you all would have",
114
+ "y'all're": "you all are",
115
+ "y'all've": "you all have",
116
+ "you'd": "you would",
117
+ "you'd've": "you would have",
118
+ "you'll": "you will",
119
+ "you'll've": "you will have",
120
+ "you're": "you are",
121
+ "you've": "you have"
122
+ }
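
This map is consumed by TextCleaner (src/text/text_cleaning.py), which expands contractions token by token before further cleaning; a minimal sketch of that lookup, run from the repository root:

    import json

    with open("src/configs/abbr.json", "r") as f:
        abbr = json.load(f)

    text = "I've shipped features and won't stop iterating"
    # Whitespace-split lookup, mirroring TextCleaner.__change_abbr.
    expanded = " ".join(abbr.get(token, token) for token in text.split(" "))
    print(expanded)  # I have shipped features and will not stop iterating
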
src/configs/config.cfg ADDED
@@ -0,0 +1,20 @@
1
+ [EMBEDDINGS]
2
+ SENTENCE_TRANSFORMER=nomic-ai/nomic-embed-text-v1.5
3
+ KEYWORD_EXTRACTOR=ml6team/keyphrase-extraction-distilbert-inspec
4
+ SCORING_EMBED=sentence-transformers/all-MiniLM-L6-v2
5
+
6
+ [CHUNKING]
7
+ CHUNK_SIZE=1000
8
+ CHUNK_OVERLAP=100
9
+
10
+ [ANALYZER]
11
+ TOP_KEYWORDS=20
12
+ MAX_KEYWORDS_SIZE=3
13
+ KEYWORD_MATCH_THRESHOLD=0.75
14
+ RESUME_SUMMARIZER=facebook/bart-large-cnn
15
+ RESUME_MAXLENGTH=150
16
+ RESUME_MINLENGTH=50
17
+
18
+ [CANDIDATE]
19
+ RESUME_MATCH_POINT_THRESHOLD=2
20
+ SECTION_MATCH_POINT_THRESHOLD=0.4
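
Every module reads its own section of this file with configparser and casts the string values at the call site; a small sketch of that pattern (paths assume the repository root as the working directory):

    import configparser

    config = configparser.ConfigParser()
    config.read("src/configs/config.cfg")

    chunk_size = int(config["CHUNKING"]["CHUNK_SIZE"])                        # 1000
    chunk_overlap = int(config["CHUNKING"]["CHUNK_OVERLAP"])                  # 100
    keyword_threshold = float(config["ANALYZER"]["KEYWORD_MATCH_THRESHOLD"])  # 0.75
    summarizer_model = config["ANALYZER"]["RESUME_SUMMARIZER"]                # facebook/bart-large-cnn
    print(chunk_size, chunk_overlap, keyword_threshold, summarizer_model)
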
src/configs/stopwords.txt ADDED
@@ -0,0 +1,758 @@
1
+ a
2
+ able
3
+ about
4
+ above
5
+ abst
6
+ accordance
7
+ according
8
+ accordingly
9
+ across
10
+ act
11
+ actually
12
+ added
13
+ adj
14
+ affected
15
+ affecting
16
+ affects
17
+ after
18
+ afterwards
19
+ again
20
+ against
21
+ ah
22
+ ain't
23
+ all
24
+ allow
25
+ allows
26
+ almost
27
+ alone
28
+ along
29
+ already
30
+ also
31
+ although
32
+ always
33
+ am
34
+ among
35
+ amongst
36
+ an
37
+ and
38
+ announce
39
+ another
40
+ any
41
+ anybody
42
+ anyhow
43
+ anymore
44
+ anyone
45
+ anything
46
+ anyway
47
+ anyways
48
+ anywhere
49
+ apart
50
+ apparently
51
+ appear
52
+ appreciate
53
+ appropriate
54
+ approximately
55
+ are
56
+ aren
57
+ arent
58
+ aren't
59
+ arise
60
+ around
61
+ as
62
+ a's
63
+ aside
64
+ ask
65
+ asking
66
+ associated
67
+ at
68
+ auth
69
+ available
70
+ away
71
+ awfully
72
+ b
73
+ back
74
+ be
75
+ became
76
+ because
77
+ become
78
+ becomes
79
+ becoming
80
+ been
81
+ before
82
+ beforehand
83
+ begin
84
+ beginning
85
+ beginnings
86
+ begins
87
+ behind
88
+ being
89
+ believe
90
+ below
91
+ beside
92
+ besides
93
+ best
94
+ better
95
+ between
96
+ beyond
97
+ biol
98
+ both
99
+ brief
100
+ briefly
101
+ but
102
+ by
103
+ c
104
+ ca
105
+ came
106
+ can
107
+ cannot
108
+ cant
109
+ can't
110
+ cause
111
+ causes
112
+ certain
113
+ certainly
114
+ changes
115
+ clearly
116
+ c'mon
117
+ co
118
+ com
119
+ come
120
+ comes
121
+ concerning
122
+ consequently
123
+ consider
124
+ considering
125
+ contain
126
+ containing
127
+ contains
128
+ corresponding
129
+ could
130
+ couldnt
131
+ couldn't
132
+ course
133
+ c's
134
+ currently
135
+ d
136
+ date
137
+ definitely
138
+ described
139
+ despite
140
+ did
141
+ didn't
142
+ different
143
+ do
144
+ does
145
+ doesn't
146
+ doing
147
+ done
148
+ don't
149
+ down
150
+ downwards
151
+ due
152
+ during
153
+ e
154
+ each
155
+ ed
156
+ edu
157
+ effect
158
+ eg
159
+ eight
160
+ eighty
161
+ either
162
+ else
163
+ elsewhere
164
+ end
165
+ ending
166
+ enough
167
+ entirely
168
+ especially
169
+ et
170
+ et-al
171
+ etc
172
+ even
173
+ ever
174
+ every
175
+ everybody
176
+ everyone
177
+ everything
178
+ everywhere
179
+ ex
180
+ exactly
181
+ example
182
+ except
183
+ f
184
+ far
185
+ few
186
+ ff
187
+ fifth
188
+ first
189
+ five
190
+ fix
191
+ followed
192
+ following
193
+ follows
194
+ for
195
+ former
196
+ formerly
197
+ forth
198
+ found
199
+ four
200
+ from
201
+ further
202
+ furthermore
203
+ g
204
+ gave
205
+ get
206
+ gets
207
+ getting
208
+ give
209
+ given
210
+ gives
211
+ giving
212
+ go
213
+ goes
214
+ going
215
+ gone
216
+ got
217
+ gotten
218
+ greetings
219
+ h
220
+ had
221
+ hadn't
222
+ happens
223
+ hardly
224
+ has
225
+ hasn't
226
+ have
227
+ haven't
228
+ having
229
+ he
230
+ hed
231
+ he'd
232
+ he'll
233
+ hello
234
+ help
235
+ hence
236
+ her
237
+ here
238
+ hereafter
239
+ hereby
240
+ herein
241
+ heres
242
+ here's
243
+ hereupon
244
+ hers
245
+ herself
246
+ hes
247
+ he's
248
+ hi
249
+ hid
250
+ him
251
+ himself
252
+ his
253
+ hither
254
+ home
255
+ hopefully
256
+ how
257
+ howbeit
258
+ however
259
+ how's
260
+ hundred
261
+ i
262
+ id
263
+ i'd
264
+ ie
265
+ if
266
+ ignored
267
+ i'll
268
+ im
269
+ i'm
270
+ immediate
271
+ immediately
272
+ importance
273
+ important
274
+ in
275
+ inasmuch
276
+ inc
277
+ indeed
278
+ index
279
+ indicate
280
+ indicated
281
+ indicates
282
+ information
283
+ inner
284
+ insofar
285
+ instead
286
+ into
287
+ invention
288
+ inward
289
+ is
290
+ isn't
291
+ it
292
+ itd
293
+ it'd
294
+ it'll
295
+ its
296
+ it's
297
+ itself
298
+ i've
299
+ j
300
+ just
301
+ k
302
+ keep
303
+ keeps
304
+ kept
305
+ kg
306
+ km
307
+ know
308
+ known
309
+ knows
310
+ l
311
+ largely
312
+ last
313
+ lately
314
+ later
315
+ latter
316
+ latterly
317
+ least
318
+ less
319
+ lest
320
+ let
321
+ lets
322
+ let's
323
+ like
324
+ liked
325
+ likely
326
+ line
327
+ little
328
+ 'll
329
+ look
330
+ looking
331
+ looks
332
+ ltd
333
+ m
334
+ made
335
+ mainly
336
+ make
337
+ makes
338
+ many
339
+ may
340
+ maybe
341
+ me
342
+ mean
343
+ means
344
+ meantime
345
+ meanwhile
346
+ merely
347
+ mg
348
+ might
349
+ million
350
+ miss
351
+ ml
352
+ more
353
+ moreover
354
+ most
355
+ mostly
356
+ mr
357
+ mrs
358
+ much
359
+ mug
360
+ must
361
+ mustn't
362
+ my
363
+ myself
364
+ n
365
+ na
366
+ name
367
+ namely
368
+ nay
369
+ nd
370
+ near
371
+ nearly
372
+ necessarily
373
+ necessary
374
+ need
375
+ needs
376
+ neither
377
+ never
378
+ nevertheless
379
+ new
380
+ next
381
+ nine
382
+ ninety
383
+ no
384
+ nobody
385
+ non
386
+ none
387
+ nonetheless
388
+ noone
389
+ nor
390
+ normally
391
+ nos
392
+ not
393
+ noted
394
+ nothing
395
+ novel
396
+ now
397
+ nowhere
398
+ o
399
+ obtain
400
+ obtained
401
+ obviously
402
+ of
403
+ off
404
+ often
405
+ oh
406
+ ok
407
+ okay
408
+ old
409
+ omitted
410
+ on
411
+ once
412
+ one
413
+ ones
414
+ only
415
+ onto
416
+ or
417
+ ord
418
+ other
419
+ others
420
+ otherwise
421
+ ought
422
+ our
423
+ ours
424
+ ourselves
425
+ out
426
+ outside
427
+ over
428
+ overall
429
+ owing
430
+ own
431
+ p
432
+ page
433
+ pages
434
+ part
435
+ particular
436
+ particularly
437
+ past
438
+ per
439
+ perhaps
440
+ placed
441
+ please
442
+ plus
443
+ poorly
444
+ possible
445
+ possibly
446
+ potentially
447
+ pp
448
+ predominantly
449
+ present
450
+ presumably
451
+ previously
452
+ primarily
453
+ probably
454
+ promptly
455
+ proud
456
+ provides
457
+ put
458
+ q
459
+ que
460
+ quickly
461
+ quite
462
+ qv
463
+ r
464
+ ran
465
+ rather
466
+ rd
467
+ re
468
+ readily
469
+ really
470
+ reasonably
471
+ recent
472
+ recently
473
+ ref
474
+ refs
475
+ regarding
476
+ regardless
477
+ regards
478
+ related
479
+ relatively
480
+ research
481
+ respectively
482
+ resulted
483
+ resulting
484
+ results
485
+ right
486
+ run
487
+ s
488
+ said
489
+ same
490
+ saw
491
+ say
492
+ saying
493
+ says
494
+ sec
495
+ second
496
+ secondly
497
+ section
498
+ see
499
+ seeing
500
+ seem
501
+ seemed
502
+ seeming
503
+ seems
504
+ seen
505
+ self
506
+ selves
507
+ sensible
508
+ sent
509
+ serious
510
+ seriously
511
+ seven
512
+ several
513
+ shall
514
+ shan't
515
+ she
516
+ shed
517
+ she'd
518
+ she'll
519
+ shes
520
+ she's
521
+ should
522
+ shouldn't
523
+ show
524
+ showed
525
+ shown
526
+ showns
527
+ shows
528
+ significant
529
+ significantly
530
+ similar
531
+ similarly
532
+ since
533
+ six
534
+ slightly
535
+ so
536
+ some
537
+ somebody
538
+ somehow
539
+ someone
540
+ somethan
541
+ something
542
+ sometime
543
+ sometimes
544
+ somewhat
545
+ somewhere
546
+ soon
547
+ sorry
548
+ specifically
549
+ specified
550
+ specify
551
+ specifying
552
+ still
553
+ stop
554
+ strongly
555
+ sub
556
+ substantially
557
+ successfully
558
+ such
559
+ sufficiently
560
+ suggest
561
+ sup
562
+ sure
563
+ t
564
+ take
565
+ taken
566
+ taking
567
+ tell
568
+ tends
569
+ th
570
+ than
571
+ thank
572
+ thanks
573
+ thanx
574
+ that
575
+ that'll
576
+ thats
577
+ that's
578
+ that've
579
+ the
580
+ their
581
+ theirs
582
+ them
583
+ themselves
584
+ then
585
+ thence
586
+ there
587
+ thereafter
588
+ thereby
589
+ thered
590
+ therefore
591
+ therein
592
+ there'll
593
+ thereof
594
+ therere
595
+ theres
596
+ there's
597
+ thereto
598
+ thereupon
599
+ there've
600
+ these
601
+ they
602
+ theyd
603
+ they'd
604
+ they'll
605
+ theyre
606
+ they're
607
+ they've
608
+ think
609
+ third
610
+ this
611
+ thorough
612
+ thoroughly
613
+ those
614
+ thou
615
+ though
616
+ thoughh
617
+ thousand
618
+ three
619
+ throug
620
+ through
621
+ throughout
622
+ thru
623
+ thus
624
+ til
625
+ tip
626
+ to
627
+ together
628
+ too
629
+ took
630
+ toward
631
+ towards
632
+ tried
633
+ tries
634
+ truly
635
+ try
636
+ trying
637
+ ts
638
+ t's
639
+ twice
640
+ two
641
+ u
642
+ un
643
+ under
644
+ unfortunately
645
+ unless
646
+ unlike
647
+ unlikely
648
+ until
649
+ unto
650
+ up
651
+ upon
652
+ ups
653
+ us
654
+ use
655
+ used
656
+ useful
657
+ usefully
658
+ usefulness
659
+ uses
660
+ using
661
+ usually
662
+ v
663
+ value
664
+ various
665
+ 've
666
+ very
667
+ via
668
+ viz
669
+ vol
670
+ vols
671
+ vs
672
+ w
673
+ want
674
+ wants
675
+ was
676
+ wasnt
677
+ wasn't
678
+ way
679
+ we
680
+ wed
681
+ we'd
682
+ welcome
683
+ well
684
+ we'll
685
+ went
686
+ were
687
+ we're
688
+ werent
689
+ weren't
690
+ we've
691
+ what
692
+ whatever
693
+ what'll
694
+ whats
695
+ what's
696
+ when
697
+ whence
698
+ whenever
699
+ when's
700
+ where
701
+ whereafter
702
+ whereas
703
+ whereby
704
+ wherein
705
+ wheres
706
+ where's
707
+ whereupon
708
+ wherever
709
+ whether
710
+ which
711
+ while
712
+ whim
713
+ whither
714
+ who
715
+ whod
716
+ whoever
717
+ whole
718
+ who'll
719
+ whom
720
+ whomever
721
+ whos
722
+ who's
723
+ whose
724
+ why
725
+ why's
726
+ widely
727
+ will
728
+ willing
729
+ wish
730
+ with
731
+ within
732
+ without
733
+ wonder
734
+ wont
735
+ won't
736
+ words
737
+ world
738
+ would
739
+ wouldnt
740
+ wouldn't
741
+ www
742
+ x
743
+ y
744
+ yes
745
+ yet
746
+ you
747
+ youd
748
+ you'd
749
+ you'll
750
+ your
751
+ youre
752
+ you're
753
+ yours
754
+ yourself
755
+ yourselves
756
+ you've
757
+ z
758
+ zero
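
The list is read line by line in CommonUtils and applied word by word in TextCleaner.remove_stopwords; a minimal sketch of that filtering, run from the repository root:

    with open("src/configs/stopwords.txt", "r") as f:
        stopwords = set(f.read().splitlines())

    sentence = "experienced in building and deploying machine learning systems"
    filtered = " ".join(word for word in sentence.split() if word not in stopwords)
    print(filtered)  # experienced building deploying machine learning systems
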
src/mains/candidate_job_match.py ADDED
@@ -0,0 +1,137 @@
1
+ import os
2
+ from src.text.chunking import Chunk
3
+ from src.utils.compare_metrics import CompareMetrics
4
+ from src.mains.resume_analyzer import ResumeAnalyzer
5
+ from src.text.embeddings import SentEmbeddings
6
+ from src.utils.commonutils import CommonUtils
7
+ from src.text.text_cleaning import TextCleaner
8
+ import configparser
9
+
10
+
11
+ config = configparser.ConfigParser()
12
+ config.read("src/configs/config.cfg")
13
+ candidate_config = config["CANDIDATE"]
14
+
15
+ pointsThreshold = int(candidate_config["RESUME_MATCH_POINT_THRESHOLD"])
16
+ sectionMatchThreshold = float(candidate_config["SECTION_MATCH_POINT_THRESHOLD"])
17
+
18
+ class MatchJobCandidate:
19
+
20
+ def __init__(self) -> None:
21
+ self.compareMetrics = CompareMetrics()
22
+ self.analyzer = ResumeAnalyzer()
23
+ self.chunk = Chunk()
24
+ self.embedding = SentEmbeddings()
25
+ self.utility = CommonUtils()
26
+ self.cleaner = TextCleaner()
27
+ pass
28
+
29
+ def __match(self, jdFile, resumeFile):
30
+
31
+ metric = 0
32
+ jdChunkList = self.chunk.chunk(jdFile)
33
+ resumeChunkList = self.chunk.chunk(resumeFile)
34
+
35
+ jdchunkEmbeddings = self.embedding.computeEmbeddingList(jdChunkList)
36
+ jdresumeEmbeddings = self.embedding.computeEmbeddingList(resumeChunkList)
37
+
38
+ total_compare = len(jdchunkEmbeddings) * len(jdresumeEmbeddings)
39
+
40
+ for i in range(len(jdchunkEmbeddings)):
41
+ for j in range(len(jdresumeEmbeddings)):
42
+ metric += self.compareMetrics.cos_sim(jdchunkEmbeddings[i],jdresumeEmbeddings[j])
43
+
44
+ return round((metric*100)/total_compare,2)
45
+
46
+ pass
47
+
48
+ def __keywordsMatch(self, jdFile, resumeFile):
49
+
50
+ jdtext_list = self.chunk.chunk(jdFile)
51
+ resumeText_list = self.chunk.chunk(resumeFile)
52
+
53
+ keywordsJD=[]
54
+ for jdtext in jdtext_list:
55
+ keywordsJD.extend(self.analyzer.extractKeywords(jdtext))
56
+
57
+ keywordsJD = sorted(list(set(keywordsJD)))
58
+
59
+ keywordsRES = []
60
+ for resumeText in resumeText_list:
61
+ keywordsRES.extend(self.analyzer.extractKeywords(resumeText))
62
+
63
+ keywordsRES = sorted(list(set(keywordsRES)))
64
+ resumeKey = []
65
+ for keyword in keywordsRES:
66
+ if not self.utility.has_numbers(keyword):
67
+ resumeKey.append(keyword)
68
+
69
+ return self.analyzer.keywordsPartialMatch(keywordsJD, keywordsRES), resumeKey
70
+ pass
71
+
72
+
73
+ def generatePointers(self, jodDescFolder, resumeFolder):
74
+ jd_list = os.listdir(jodDescFolder)
75
+ resume_list = os.listdir(resumeFolder)
76
+
77
+ jd_dict = dict()
78
+
79
+ for jd in jd_list:
80
+
81
+ resume_dict = dict()
82
+
83
+ for resume in resume_list:
84
+ jdFile = os.path.join(jodDescFolder, jd)
85
+ resumeFile = os.path.join(resumeFolder, resume)
86
+ metric = self.__match(jdFile, resumeFile)
87
+ resume_dict[resume] = metric
88
+
89
+ jd_dict[jd] = {k: v for k, v in sorted(resume_dict.items(), key=lambda item: item[1], reverse=True)}
90
+
91
+ return jd_dict
92
+ pass
93
+
94
+ def extractJDResumeKeywords(self, jodDescFolder, resumeFolder):
95
+ jd_list = os.listdir(jodDescFolder)
96
+ resume_list = os.listdir(resumeFolder)
97
+
98
+ jd_dict = dict()
99
+
100
+ for jd in jd_list:
101
+
102
+ resume_dict = dict()
103
+
104
+ for resume in resume_list:
105
+ jdFile = os.path.join(jodDescFolder, jd)
106
+ resumeFile = os.path.join(resumeFolder, resume)
107
+ resume_dict[resume], resume_dict[resume]["resume_keywords"] = self.__keywordsMatch(jdFile, resumeFile)
108
+
109
+ jd_dict[jd] = resume_dict
110
+
111
+ return jd_dict
112
+ pass
113
+
114
+ def getJDResumeScore(self, jodDescFolder, resumeFolder):
115
+ jd_list = os.listdir(jodDescFolder)
116
+ resume_list = os.listdir(resumeFolder)
117
+
118
+ jd_dict = dict()
119
+ for jd in jd_list:
120
+ jdText = self.cleaner.clean_text(self.chunk.getTextFromPdf(os.path.join(jodDescFolder, jd)))
121
+ resume_dict = dict()
122
+ for resume in resume_list:
123
+ resumeText = self.cleaner.clean_text(self.chunk.getTextFromPdf(os.path.join(resumeFolder, resume)))
124
+ results = self.compareMetrics.get_score(resumeText, jdText)
125
+ resume_dict[resume] = results[0].score
126
+ jd_dict[jd] = resume_dict
127
+
128
+ return jd_dict
129
+
130
+ pass
131
+
132
+ if __name__ == "__main__":
133
+ match = MatchJobCandidate()
134
+ jodDescFolder = "D:/Study Material/HR Assist/Code/Talent-Scout-AI/test_data/JDS"
135
+ resumeFolder = "D:/Study Material/HR Assist/Code/Talent-Scout-AI/test_data/RESUMES"
136
+ print(match.generatePointers(jodDescFolder, resumeFolder))
137
+ pass
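
The score returned by __match above is the mean cosine similarity over every JD-chunk/resume-chunk pair, scaled to 0-100. A self-contained sketch of that aggregation; the model here is a small public one chosen purely for illustration (the repo's configured embedder is nomic-ai/nomic-embed-text-v1.5):

    from sentence_transformers import SentenceTransformer, util

    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")  # illustrative stand-in

    jd_chunks = ["5+ years of Python backend experience", "exposure to NLP and transformer models"]
    resume_chunks = ["built Flask microservices in Python", "fine-tuned BERT models for text classification"]

    jd_emb = model.encode(jd_chunks)
    res_emb = model.encode(resume_chunks)

    # Average all pairwise cosine similarities and scale to 0-100, as in MatchJobCandidate.__match.
    sims = util.cos_sim(jd_emb, res_emb)          # shape: (len(jd_chunks), len(resume_chunks))
    score = round(float(sims.mean()) * 100, 2)
    print(score)
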
src/mains/resume_analyzer.py ADDED
@@ -0,0 +1,91 @@
1
+ from src.text.text_cleaning import TextCleaner
2
+ from src.text.embeddings import SentEmbeddings
3
+ from src.utils.compare_metrics import CompareMetrics
4
+ import configparser, os
5
+ from src.text.keywords import KeyphraseExtractionPipeline
6
+ from src.text.chunking import Chunk
7
+ from transformers import pipeline
8
+
9
+
10
+ config = configparser.ConfigParser()
11
+ config.read("src/configs/config.cfg")
12
+ analyzer_config = config["ANALYZER"]
13
+
14
+ topKey = float(analyzer_config["TOP_KEYWORDS"])
15
+ maxGram = float(analyzer_config["MAX_KEYWORDS_SIZE"])
16
+ matchThreshold = float(analyzer_config["KEYWORD_MATCH_THRESHOLD"])
17
+ resume_summarizer = analyzer_config["RESUME_SUMMARIZER"]
18
+ maxlength = int(analyzer_config["RESUME_MAXLENGTH"])
19
+ minlength = int(analyzer_config["RESUME_MINLENGTH"])
20
+
21
+ class ResumeAnalyzer:
22
+
23
+ def __init__(self) -> None:
24
+
25
+ self.keywordExtractor = KeyphraseExtractionPipeline()
26
+ self.cleaning = TextCleaner()
27
+ self.embeddings = SentEmbeddings()
28
+ self.compare = CompareMetrics()
29
+ self.chunk = Chunk(chunksize=1000, overlap=100)
30
+ self.summarizer = pipeline("summarization", model=resume_summarizer)
31
+
32
+ pass
33
+
34
+
35
+ def extractKeywords(self, text):
36
+ keywords = self.keywordExtractor(text)
37
+ keylist = []
38
+ for kw in keywords:
39
+ keylist.append(self.cleaning.clean_text(kw))
40
+
41
+ return keylist
42
+ pass
43
+
44
+
45
+ def keywordsPartialMatch(self, jdKeywords, resumeKeywords):
46
+
47
+ jdKeywords = sorted(list(set(jdKeywords)))
48
+ resumeKeywords = sorted(list(set(resumeKeywords)))
49
+
50
+ jdKeywords_embed = self.embeddings.computeEmbeddingList(jdKeywords)
51
+ resumeKeywords_embed = self.embeddings.computeEmbeddingList(resumeKeywords)
52
+
53
+ match_jd_res_key = dict()
54
+
55
+ for i in range(len(jdKeywords)):
56
+ resKeys = []
57
+ for j in range(len(resumeKeywords)):
58
+ metric = self.compare.cos_sim(jdKeywords_embed[i], resumeKeywords_embed[j])
59
+ if metric > matchThreshold:
60
+ resKeys.append(resumeKeywords[j])
61
+
62
+ if resKeys:
63
+ match_jd_res_key[jdKeywords[i]] = resKeys
64
+
65
+ return match_jd_res_key
66
+ pass
67
+
68
+
69
+ def __summarizeBatch(self, textBatch):
70
+ return self.summarizer(textBatch, max_length=maxlength, min_length=minlength, do_sample=False)
71
+ pass
72
+
73
+ def resumeBatchSummarizer(self, resumeFolder):
74
+ resume_list = os.listdir(resumeFolder)
75
+
76
+ resumeSummarize = dict()
77
+
78
+ for resumeFile in resume_list:
79
+ file = os.path.join(resumeFolder, resumeFile)
80
+ resumeChunk_list = self.chunk.chunk(file)
81
+ response = self.__summarizeBatch(resumeChunk_list)
82
+ print(response)
83
+ summarize = ""
84
+ for summary in response:
85
+ summarize += " "+str(summary['summary_text'])
86
+ resumeSummarize[resumeFile] = summarize
87
+
88
+ return resumeSummarize
89
+ pass
90
+
91
+ pass
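
resumeBatchSummarizer above chunks each PDF, runs the configured summarizer over the chunk batch, and concatenates the summary_text fields. A trimmed-down sketch of that step on plain text (model name and length limits mirror config.cfg; the input string is invented):

    from transformers import pipeline

    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")  # RESUME_SUMMARIZER

    chunks = [
        "Senior data engineer with seven years of experience designing batch and streaming "
        "pipelines on Spark and Kafka, leading a team of four engineers and cutting warehouse "
        "costs by thirty percent while improving data freshness from daily to hourly."
    ]
    outputs = summarizer(chunks, max_length=150, min_length=50, do_sample=False)
    summary = " ".join(out["summary_text"] for out in outputs)
    print(summary)
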
src/mains/resume_metadata.py ADDED
@@ -0,0 +1,163 @@
1
+
2
+ import re, os
3
+ from pdfminer.high_level import extract_text
4
+ import spacy
5
+ from spacy.matcher import Matcher
6
+ from src.utils.commonutils import CommonUtils
7
+ from src.mains.resume_analyzer import ResumeAnalyzer
8
+
9
+ class ResumeMetaData():
10
+
11
+ def __init__(self) -> None:
12
+ self.utils = CommonUtils()
13
+ self.analyzer = ResumeAnalyzer()
14
+ pass
15
+
16
+
17
+ def extract_text_from_pdf(self, pdf_path):
18
+ return extract_text(pdf_path)
19
+
20
+
21
+ def extract_contact_number_from_resume(self, text):
22
+ contact_number = None
23
+ # Use regex pattern to find a potential contact number
24
+ pattern = r"\b(?:\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b"
25
+ tmp = re.findall(pattern,text)
26
+ r1 = '[^0-9]+'
27
+ contact_number_list = []
28
+ for con in tmp:
29
+ contact_number_list.append(re.sub(r1, "", con)[-10:])
30
+
31
+ contact_number = ", ".join(contact_number_list)
32
+
33
+ return contact_number
34
+
35
+
36
+ def extract_email_from_resume(self, text):
37
+ email = None
38
+ # Use regex pattern to find a potential email address
39
+ pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
40
+ email = ", ".join(re.findall(pattern,text))
41
+ return email
42
+
43
+
44
+ def extract_education_from_resume(self, text):
45
+ education = []
46
+
47
+ # Use regex pattern to find education information
48
+ pattern = r"(?i)(?:Bsc|\bB\.\w+|\bM\.\w+|\bPh\.D\.\w+|\bBachelor(?:'s)?|\bMaster(?:'s)?|\bPh\.D)\s(?:\w+\s)*\w+"
49
+ matches = re.findall(pattern, text)
50
+ for match in matches:
51
+ education.append(match.strip())
52
+
53
+ return education
54
+
55
+
56
+ def extract_name(self, resume_text):
57
+ nlp = spacy.load('en_core_web_lg')
58
+ matcher = Matcher(nlp.vocab)
59
+
60
+ # Define name patterns
61
+ patterns = [
62
+ [{'POS': 'PROPN'}, {'POS': 'PROPN'}], # First name and Last name
63
+ [{'POS': 'PROPN'}, {'POS': 'PROPN'}, {'POS': 'PROPN'}], # First name, Middle name, and Last name
64
+ [{'POS': 'PROPN'}, {'POS': 'PROPN'}, {'POS': 'PROPN'}, {'POS': 'PROPN'}] # First name, Middle name, Middle name, and Last name
65
+ # Add more patterns as needed
66
+ ]
67
+
68
+ for pattern in patterns:
69
+ matcher.add('NAME', patterns=[pattern])
70
+
71
+ doc = nlp(resume_text)
72
+ matches = matcher(doc)
73
+
74
+ for match_id, start, end in matches:
75
+ span = doc[start:end]
76
+ return span.text
77
+
78
+ return None
79
+
80
+ def extract_links_extended(self, text):
81
+ links = []
82
+ pattern = r'\b((?:https?://)?(?:(?:www\.)?(?:[\da-z\.-]+)\.(?:[a-z]{2,6})|(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)|(?:(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])))(?::[0-9]{1,4}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])?(?:/[\w\.-]*)*/?)\b'
83
+ links = re.findall(pattern, text)
84
+ for link in links:
85
+ if "/" not in link:
86
+ links.remove(link)
87
+ return links
88
+
89
+
90
+ def extract_keywords(self, text):
91
+ return self.analyzer.extractKeywords(text)
92
+
93
+ def extractMetaData(self, resumeFolder):
94
+
95
+ resume_list = os.listdir(resumeFolder)
96
+ resume_info = dict()
97
+
98
+ for resume in resume_list:
99
+ print(resume)
100
+ meta_data = dict()
101
+ resume_path = os.path.join(resumeFolder, resume)
102
+ text = self.extract_text_from_pdf(resume_path)
103
+
104
+
105
+ name = self.extract_name(text)
106
+ if name:
107
+ meta_data["Name"] = name
108
+ else:
109
+ meta_data["Name"] = ""
110
+
111
+
112
+ contact_number = self.extract_contact_number_from_resume(text)
113
+ if contact_number:
114
+ meta_data["Contact Number"] = contact_number
115
+ else:
116
+ meta_data["Contact Number"] = ""
117
+
118
+
119
+ email = self.extract_email_from_resume(text)
120
+ if email:
121
+ meta_data["Email"] = email
122
+ else:
123
+ print("Email not found")
124
+
125
+
126
+ extracted_education = self.extract_education_from_resume(text)
127
+ if extracted_education:
128
+ meta_data["Education"] = extracted_education
129
+ else:
130
+ meta_data["Education"] = ""
131
+
132
+
133
+ extracted_links = self.extract_links_extended(text)
134
+ if extracted_links:
135
+ meta_data["Links"] = extracted_links
136
+ else:
137
+ meta_data["Links"] = ""
138
+
139
+
140
+ extracted_keywords = self.extract_keywords(text)
141
+ if extracted_keywords:
142
+ meta_data["Skills"] = extracted_keywords
143
+ else:
144
+ meta_data["Skills"] = ""
145
+
146
+
147
+ resume_info[resume] = meta_data
148
+
149
+ return resume_info
150
+
151
+ pass
152
+
153
+
154
+ if __name__ == '__main__':
155
+
156
+ resumeFolder = "D:/Study Material/Projects/HR Assist/Code/test_data/RESUMES"
157
+
158
+ metadata = ResumeMetaData()
159
+ info = metadata.extractMetaData(resumeFolder)
160
+
161
+ print(info)
162
+
163
+ pass
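
The extractors above are plain regex passes over raw PDF text; a small illustration of the email and contact-number patterns on an invented sample:

    import re

    sample = "Jane Doe | jane.doe@example.com | +1 (555) 123-4567 | https://github.com/janedoe"

    email_pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
    phone_pattern = r"\b(?:\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b"

    print(re.findall(email_pattern, sample))  # ['jane.doe@example.com']
    print(re.findall(phone_pattern, sample))  # ['1 (555) 123-4567']
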
src/text/chunking.py ADDED
@@ -0,0 +1,40 @@
1
+ import fitz
2
+ from semantic_text_splitter import TextSplitter
3
+ import configparser
4
+
5
+ config = configparser.ConfigParser()
6
+ config.read("src/configs/config.cfg")
7
+ chunk_config = config["CHUNKING"]
8
+
9
+
10
+ class Chunk:
11
+ def __init__(self, chunksize=int(chunk_config["CHUNK_SIZE"]), overlap=int(chunk_config["CHUNK_OVERLAP"])) -> None:
12
+ self.splitter = TextSplitter(capacity=chunksize, overlap=overlap)
13
+
14
+ def chunk(self, inputFileLoc) -> list:
15
+ doc = fitz.open(inputFileLoc)
16
+
17
+ text = ""
18
+ for page in doc:
19
+ text += " "+ page.get_text()
20
+
21
+ chunks = self.splitter.chunks(text)
22
+
23
+ return chunks
24
+
25
+ def getTextFromPdf(self, inputFileLoc) -> list:
26
+ doc = fitz.open(inputFileLoc)
27
+
28
+ text = ""
29
+ for page in doc:
30
+ text += " "+ page.get_text()
31
+
32
+ return text
33
+
34
+
35
+
36
+
37
+ if __name__ == "__main__":
38
+ input_file = '../test_data/RESUMES/AnanyaDasResume.pdf'
39
+ chunker = Chunk()
40
+ print(chunker.chunk(input_file))
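
The splitter itself works on plain strings, so the PDF step can be skipped when experimenting; capacity and overlap below mirror the [CHUNKING] defaults:

    from semantic_text_splitter import TextSplitter

    splitter = TextSplitter(capacity=1000, overlap=100)  # CHUNK_SIZE / CHUNK_OVERLAP

    text = "Results-driven engineer with broad platform experience. " * 200  # stand-in for PDF text
    chunks = splitter.chunks(text)
    print(len(chunks), max(len(c) for c in chunks))  # several chunks, each at most ~1000 characters
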
src/text/embeddings.py ADDED
@@ -0,0 +1,39 @@
1
+ from sentence_transformers import SentenceTransformer
2
+ from src.text.text_cleaning import TextCleaner
3
+ import configparser
4
+
5
+ config = configparser.ConfigParser()
6
+ config.read("src/configs/config.cfg")
7
+ embed_config = config["EMBEDDINGS"]
8
+
9
+
10
+ class SentEmbeddings():
11
+
12
+ def __init__(self) -> None:
13
+ self.model = SentenceTransformer(embed_config['SENTENCE_TRANSFORMER'], trust_remote_code=True)  # device auto-selected (CUDA when available, otherwise CPU)
14
+ pass
15
+
16
+ def computeEmbedding(self, sentence):
17
+ cleaner = TextCleaner()
18
+ clean_sent = cleaner.clean_text(sentence)
19
+ return self.model.encode(clean_sent)
20
+ pass
21
+
22
+ def computeEmbeddingList(self, sentenceList):
23
+ cleaner = TextCleaner()
24
+ cleaned_sentList = []
25
+ for i in range(len(sentenceList)):
26
+ cleaned_sentList.append(cleaner.clean_text(sentenceList[i]))
27
+ return self.model.encode(cleaned_sentList)
28
+ pass
29
+
30
+ pass
31
+
32
+
33
+ if __name__ == "__main__":
34
+ embed = SentEmbeddings()
35
+ test_sent = """This isn't a panda,,,, you are wrong this is a well versed bear ..
36
+ which you'll never understand!!!!!!!!!!!!!!!!"""
37
+ embedding = embed.computeEmbedding(test_sent)
38
+ print(embedding)
39
+ pass
src/text/keywords.py ADDED
@@ -0,0 +1,23 @@
1
+ from transformers import TokenClassificationPipeline, AutoModelForTokenClassification, AutoTokenizer
2
+ from transformers.pipelines import AggregationStrategy
3
+ import numpy as np
4
+ import configparser
5
+
6
+ config = configparser.ConfigParser()
7
+ config.read("src/configs/config.cfg")
8
+ embed_config = config["EMBEDDINGS"]
9
+
10
+ class KeyphraseExtractionPipeline(TokenClassificationPipeline):
11
+
12
+ def __init__(self,):
13
+ super().__init__(
14
+ model=AutoModelForTokenClassification.from_pretrained(str(embed_config["KEYWORD_EXTRACTOR"])),
15
+ tokenizer=AutoTokenizer.from_pretrained(embed_config["KEYWORD_EXTRACTOR"])
16
+ )
17
+
18
+ def postprocess(self, all_outputs):
19
+ results = super().postprocess(
20
+ all_outputs=all_outputs,
21
+ aggregation_strategy=AggregationStrategy.FIRST,
22
+ )
23
+ return np.unique([result.get("word").strip() for result in results])
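
A minimal usage sketch for the pipeline above (run from the repository root so config.cfg resolves; the model is downloaded on first use and the printed phrases are only indicative):

    from src.text.keywords import KeyphraseExtractionPipeline

    extractor = KeyphraseExtractionPipeline()
    text = "Designed scalable data pipelines using Apache Spark and deployed models with Docker."
    print(extractor(text))  # unique keyphrases, e.g. ['Apache Spark', 'Docker', ...]
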
src/text/text_cleaning.py ADDED
@@ -0,0 +1,55 @@
1
+ import re
2
+ from nltk.stem import WordNetLemmatizer
3
+ from src.utils.commonutils import CommonUtils
4
+
5
+ class TextCleaner:
6
+
7
+ def __init__(self) -> None:
8
+ self.lemmatizer = WordNetLemmatizer()
9
+ self.comonUtils = CommonUtils()
10
+ self.stopwords = self.comonUtils.loadStropwords()
11
+ self.abbr_words = self.comonUtils.loadAbbreviations()
12
+ pass
13
+
14
+ def __remove_html_tags(self, text):
15
+ clean_text = re.sub(r'<.*?>', '', text)
16
+ return clean_text
17
+
18
+ def __remove_special_characters(self, text):
19
+ clean_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
20
+ return clean_text
21
+
22
+ def __convert_to_lowercase(self, text):
23
+ lowercased_text = text.lower()
24
+ return lowercased_text
25
+
26
+ def __change_abbr(self, text):
27
+ abbreviation = ' '.join([self.abbr_words[t] if t in self.abbr_words else t for t in text.split(" ")])
28
+ return abbreviation
29
+
30
+ def __remove_whitespace(self, text):
31
+ cleaned_text = ' '.join(text.split())
32
+ return cleaned_text
33
+
34
+ def __lemmatize_text(self, tokens):
35
+ lemmatized_tokens = ' '.join([self.lemmatizer.lemmatize(word) for word in tokens.split()])
36
+ return lemmatized_tokens
37
+
38
+ def remove_stopwords(self, tokens):
39
+ filtered_tokens = ' '.join([word for word in tokens.split() if word not in self.stopwords])
40
+ return filtered_tokens
41
+
42
+ def remove_numbers(self, text):
43
+ result = re.sub(r'[0-9]+', ' ', text)
44
+ result = self.__remove_whitespace(result)
45
+ return result
46
+
47
+ def clean_text(self, text):
48
+ sentence = self.__remove_html_tags(text)
49
+ sentence = self.__change_abbr(sentence)
50
+ sentence = self.__lemmatize_text(sentence)
51
+ sentence = self.__remove_special_characters(sentence)
52
+ sentence = self.__convert_to_lowercase(sentence)
53
+ sentence = self.__remove_whitespace(sentence)
54
+ return sentence
55
+ pass
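
clean_text chains the private helpers above: strip HTML, expand contractions, lemmatize, drop special characters, lowercase, and collapse whitespace. Run from the repository root (and with the NLTK wordnet corpus available) it behaves roughly like this:

    import nltk
    nltk.download("wordnet", quiet=True)  # WordNetLemmatizer needs this corpus

    from src.text.text_cleaning import TextCleaner

    cleaner = TextCleaner()
    raw = "<p>I've been designing APIs &  micro-services!!</p>"
    print(cleaner.clean_text(raw))  # roughly: "i have been designing apis microservices"
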
src/utils/commonutils.py ADDED
@@ -0,0 +1,58 @@
1
+ import os, json, re
2
+ from datetime import datetime
3
+ from dateutil import relativedelta
4
+
5
+ class CommonUtils:
6
+
7
+ def __init__(self) -> None:
8
+ pass
9
+
10
+ def loadStropwords(self,):
11
+ with open(os.path.join("src", "configs", "stopwords.txt"), "r") as g:
12
+ stopwords = g.read().splitlines()
13
+ return stopwords
14
+
15
+ def loadAbbreviations(self,):
16
+ with open(os.path.join("src", "configs", "abbr.json"), "r") as json_file:
17
+ data = json.load(json_file)
18
+ return data
19
+
20
+ def has_numbers(self, inputString):
21
+ return bool(re.search(r'\d', inputString))
22
+
23
+
24
+ def get_number_of_months_from_dates(self, date1, date2):
25
+ if date2.lower() == 'present':
26
+ date2 = datetime.now().strftime('%b %Y')
27
+ try:
28
+ if len(date1.split()[0]) > 3:
29
+ date1 = date1.split()
30
+ date1 = date1[0][:3] + ' ' + date1[1]
31
+ if len(date2.split()[0]) > 3:
32
+ date2 = date2.split()
33
+ date2 = date2[0][:3] + ' ' + date2[1]
34
+ except IndexError:
35
+ return 0
36
+ try:
37
+ date1 = datetime.strptime(str(date1), '%b %Y')
38
+ date2 = datetime.strptime(str(date2), '%b %Y')
39
+ months_of_experience = relativedelta.relativedelta(date2, date1)
40
+ months_of_experience = (months_of_experience.years
41
+ * 12 + months_of_experience.months)
42
+ except ValueError:
43
+ return 0
44
+ return months_of_experience
45
+
46
+ pass
47
+
48
+
49
+
50
+
51
+ if __name__ == "__main__":
52
+
53
+ cu = CommonUtils()
54
+ print(type(cu.loadAbbreviations()))
55
+ print(cu.loadAbbreviations())
56
+
57
+
58
+ pass
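
get_number_of_months_from_dates normalises month names to their three-letter form, parses them with '%b %Y', and returns the span in months ('present' maps to today); for example, run from the repository root:

    from src.utils.commonutils import CommonUtils

    utils = CommonUtils()
    print(utils.get_number_of_months_from_dates("January 2022", "Mar 2023"))  # 14
    print(utils.get_number_of_months_from_dates("Jun 2023", "present"))       # months up to the current date
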
src/utils/compare_metrics.py ADDED
@@ -0,0 +1,57 @@
1
+
2
+ from sentence_transformers import util
3
+ from src.text.embeddings import SentEmbeddings
4
+ from src.text.text_cleaning import TextCleaner
5
+ from typing import List
6
+ from qdrant_client import QdrantClient
7
+ import configparser
8
+
9
+ config = configparser.ConfigParser()
10
+ config.read("src/configs/config.cfg")
11
+ embed_config = config["EMBEDDINGS"]
12
+
13
+ class CompareMetrics:
14
+
15
+ def __init__(self) -> None:
16
+ self.sentEmbedding = SentEmbeddings()
17
+ self.textCleaner = TextCleaner()
18
+ pass
19
+
20
+ def dot_score(self, emb1, emb2):
21
+ return round(util.dot_score(emb1, emb2).numpy()[0][0].tolist(),2)
22
+
23
+ def cos_sim(self, emb1, emb2):
24
+ return round(util.cos_sim(emb1, emb2).numpy()[0][0].tolist(),2)
25
+
26
+ def calculate_similarity(self, sent1, sent2):
27
+ metrics = dict()
28
+ cleaned_sent1 = self.textCleaner.clean_text(sent1)
29
+ cleaned_sent2 = self.textCleaner.clean_text(sent2)
30
+
31
+ emb1 = self.sentEmbedding.computeEmbedding(cleaned_sent1)
32
+ emb2 = self.sentEmbedding.computeEmbedding(cleaned_sent2)
33
+ metrics['dot_score'] = self.dot_score(emb1, emb2)
34
+ metrics['cos_sim'] = self.cos_sim(emb1, emb2)
35
+
36
+ ## sending only cos_sim as both are same
37
+ return metrics['cos_sim']
38
+
39
+
40
+ def get_score(self, resume_string, job_description_string):
41
+
42
+ documents: List[str] = [resume_string]
43
+ client = QdrantClient(":memory:")
44
+ client.set_model(embed_config['SCORING_EMBED'])
45
+
46
+ client.add(
47
+ collection_name="demo_collection",
48
+ documents=documents,
49
+ )
50
+
51
+ search_result = client.query(
52
+ collection_name="demo_collection", query_text=job_description_string
53
+ )
54
+
55
+ return search_result
56
+
57
+ pass
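
The comment in calculate_similarity treats dot_score and cos_sim as interchangeable, which only holds for unit-normalised embeddings; a quick check of that assumption with toy vectors:

    import numpy as np
    from sentence_transformers import util

    a = np.array([[0.6, 0.8]], dtype=np.float32)   # unit length
    b = np.array([[1.2, 1.6]], dtype=np.float32)   # same direction, twice the length

    print(float(util.cos_sim(a, b)))    # 1.0 (direction only)
    print(float(util.dot_score(a, b)))  # 2.0 (magnitude matters)

    b_unit = b / np.linalg.norm(b)
    print(float(util.cos_sim(a, b_unit)), float(util.dot_score(a, b_unit)))  # 1.0 1.0
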
src/utils/scout.py ADDED
@@ -0,0 +1,21 @@
1
+ import google.generativeai as genai
2
+ import os, textwrap
3
+
4
+ def to_markdown(text):
5
+ text = text.replace('•', ' *')
6
+ return textwrap.indent(text, '> ', predicate=lambda _: True)
7
+
8
+ GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")  # read the key from the environment rather than hard-coding it
9
+ genai.configure(api_key=GOOGLE_API_KEY)
10
+
11
+
12
+ for m in genai.list_models():
13
+ if 'generateContent' in m.supported_generation_methods:
14
+ print(m.name)
15
+
16
+
17
+ model = genai.GenerativeModel('gemini-pro',)
18
+
19
+ response = model.generate_content("What is the meaning of life?")
20
+
21
+ print(response.text)
static/scripts.js ADDED
@@ -0,0 +1,54 @@
1
+
2
+
3
+
4
+ document.getElementById('compare-button').addEventListener('click', function() {
5
+ var jdfiles = document.getElementById('jd');
6
+ var resfiles = document.getElementById('resume');
7
+
8
+ document.getElementById('comparison-output').innerText = 'Generating Response...';
9
+
10
+ if (jdfiles.value.length < 1 || resfiles.value.length < 1) {
11
+ alert("Please select pdf to upload..");
12
+ return false;
13
+ }
14
+ else if(jdfiles.files.length > 1){
15
+ alert("Max 1 file can be uploaded in JD.");
16
+ return false;
17
+ }
18
+ else if(resfiles.files.length > 5){
19
+ alert("Max 5 files can be uploaded in Resume.");
20
+ return false;
21
+ }
22
+
23
+ const formData = new FormData();
24
+
25
+ for (var x = 0; x < jdfiles.files.length; x++) {
26
+ formData.append("jdfiles", jdfiles.files[x]);
27
+ }
28
+
29
+ for (var x = 0; x < resfiles.files.length; x++) {
30
+ formData.append("resfiles", resfiles.files[x]);
31
+ }
32
+
35
+
36
+ fetch('/summarize_resume', {
37
+ method: 'POST',
38
+ body: formData
39
+ })
40
+ .then(response => response.json())
41
+ .then(data => {
42
+ document.getElementById('comparison-output').innerText = JSON.stringify(data, null, 2);
43
+ })
44
+ .catch(error => {
45
+ console.error('Error:', error);
46
+ document.getElementById('comparison-output').innerText = 'An error occurred during comparison.';
47
+ });
48
+ });
49
+
50
+
51
+ document.getElementById('clear-button').addEventListener('click', function() {
52
+ document.getElementById('upload-form').reset();
53
+ document.getElementById('comparison-output').innerText = '';
54
+ });
static/styles.css ADDED
@@ -0,0 +1,84 @@
1
+ body {
2
+ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
3
+ background: linear-gradient(0.25turn, #3f87a6, #ebf8e1, #f69d3c);
4
+ margin: 0;
5
+ padding: 0;
6
+ display: flex;
7
+ justify-content: center;
8
+ align-items: center;
9
+ height: 100vh;
10
+ color: #fff;
11
+ }
12
+
13
+ .container {
14
+ background: #fff;
15
+ padding: 40px 50px;
16
+ box-shadow: 0 10px 30px rgba(0, 0, 0, 0.1);
17
+ border-radius: 10px;
18
+ text-align: center;
19
+ width: 90%;
20
+ max-width: 500px;
21
+ color: #333;
22
+ }
23
+
24
+ h1 {
25
+ margin-bottom: 25px;
26
+ font-size: 2em;
27
+ color: #800020;
28
+ }
29
+
30
+ .file-input {
31
+ margin-bottom: 20px;
32
+ }
33
+
34
+ label {
35
+ display: block;
36
+ margin-bottom: 10px;
37
+ font-weight: bold;
38
+ font-size: 1.1em;
39
+ color: #800020;
40
+ }
41
+
42
+ input[type="file"] {
43
+ width: 100%;
44
+ padding: 10px;
45
+ border: 2px solid #ddd;
46
+ border-radius: 5px;
47
+ font-size: 1em;
48
+ transition: border-color 0.3s ease;
49
+ }
50
+
51
+ input[type="file"]:focus {
52
+ border-color: #800020;
53
+ outline: none;
54
+ }
55
+
56
+ button {
57
+ padding: 12px 25px;
58
+ background: #800020;
59
+ color: #fff;
60
+ border: none;
61
+ border-radius: 25px;
62
+ cursor: pointer;
63
+ font-size: 1em;
64
+ transition: background 0.3s ease, transform 0.3s ease;
65
+ box-shadow: 0 5px 15px rgba(128, 0, 32, 0.2);
66
+ }
67
+
68
+ button:hover {
69
+ background: #4c0014;
70
+ transform: translateY(-2px);
71
+ }
72
+
73
+ #results {
74
+ margin-top: 35px;
75
+ }
76
+
77
+ #comparison-output {
78
+ padding: 20px;
79
+ background: #f1f1f1;
80
+ border-radius: 5px;
81
+ box-shadow: inset 0 0 10px rgba(0, 0, 0, 0.1);
82
+ text-align: left;
83
+ white-space: pre-wrap;
84
+ }
templates/index.html ADDED
@@ -0,0 +1,31 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Talent Scout AI</title>
7
+ <link rel="stylesheet" href="{{ url_for('static', filename='styles.css') }}">
8
+ </head>
9
+ <body>
10
+ <div class="container">
11
+ <h1>Talent Scout AI</h1>
12
+ <form id="upload-form">
13
+ <div class="file-input">
14
+ <label for="jd">Upload JD:</label>
15
+ <input type="file" id="jd" accept="application/pdf" required multiple>
16
+ </div>
17
+ <div class="file-input">
18
+ <label for="resume">Upload RESUME:</label>
19
+ <input type="file" id="resume" accept="application/pdf" required multiple>
20
+ </div>
21
+ <button type="button" id="compare-button">Compare</button>
22
+ <button type="button" id="clear-button">Clear All</button>
23
+ </form>
24
+ <div id="results">
25
+ <h2>Comparison Results</h2>
26
+ <div id="comparison-output" style="overflow-y: scroll; height:200px;"></div>
27
+ </div>
28
+ </div>
29
+ <script src="{{ url_for('static', filename='scripts.js') }}"></script>
30
+ </body>
31
+ </html>