prasadbobby committed on
Commit
fabc63c
1 Parent(s): 57d4231

Add application file

Demo/Interface.png ADDED
Demo/Interface_Results.png ADDED
Demo/Workflow.png ADDED
Models.py ADDED
@@ -0,0 +1,67 @@
+ import gensim
+ from gensim.models.doc2vec import Doc2Vec, TaggedDocument
+ from nltk.tokenize import word_tokenize
+ from gensim.models.doc2vec import Doc2Vec
+ import nltk
+ from transformers import AutoTokenizer, AutoModel
+ from sklearn.metrics.pairwise import cosine_similarity
+ import torch
+ import numpy as np
+ import streamlit as st
+
+ # Mean pooling - take the attention mask into account for correct averaging
+ def mean_pooling(model_output, attention_mask):
+     token_embeddings = model_output[0]  # first element of model_output contains all token embeddings
+     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+
+
+ @st.cache_resource
+ def get_HF_embeddings(sentences):
+
+     # Load model from the Hugging Face Hub
+     tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')
+     model = AutoModel.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')
+     # Tokenize sentences
+     encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt', max_length=512)
+     # Compute token embeddings
+     with torch.no_grad():
+         model_output = model(**encoded_input)
+     # Perform pooling. In this case, mean pooling.
+     embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
+
+     # print("Sentence embeddings:")
+     # print(embeddings)
+     return embeddings
+
+
+ @st.cache_data
+ def get_doc2vec_embeddings(JD, text_resume):
+     nltk.download("punkt")
+     data = [JD]
+     resume_embeddings = []
+
+     tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(data)]
+     # print(tagged_data)
+
+     model = gensim.models.doc2vec.Doc2Vec(vector_size=512, min_count=3, epochs=80)
+     model.build_vocab(tagged_data)
+     model.train(tagged_data, total_examples=model.corpus_count, epochs=80)
+     JD_embeddings = np.transpose(model.docvecs['0'].reshape(-1, 1))
+
+     for i in text_resume:
+         text = word_tokenize(i.lower())
+         embeddings = model.infer_vector(text)
+         resume_embeddings.append(np.transpose(embeddings.reshape(-1, 1)))
+     return (JD_embeddings, resume_embeddings)
+
+
+ def cosine(embeddings1, embeddings2):
+     # Get the match percentage of each resume embedding against the JD embedding
+     score_list = []
+     for i in embeddings1:
+         matchPercentage = cosine_similarity(np.array(i), np.array(embeddings2))
+         matchPercentage = np.round(matchPercentage, 4) * 100  # round to two decimal places
+         print("Your resume matches about " + str(matchPercentage[0][0]) + "% of the job description.")
+         score_list.append(str(matchPercentage[0][0]))
+     return score_list
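A minimal sketch of how the functions in `Models.py` fit together outside the Streamlit app: embed a job description and a resume with `get_HF_embeddings`, then score them with `cosine`. The sample strings and variable names below are illustrative only, not part of this commit.

```
# Sketch: score one resume against one job description using Models.py.
# The texts are placeholders, not real data.
from Models import get_HF_embeddings, cosine

jd_text = "Looking for a Python developer with NLP and PyTorch experience."
resume_text = "Software engineer with three years of Python and NLP projects."

jd_emb = get_HF_embeddings(jd_text)          # (1, 768) tensor after mean pooling
resume_emb = get_HF_embeddings(resume_text)  # (1, 768) tensor after mean pooling

# cosine() takes a list of resume embeddings plus the JD embedding
# and returns the match percentages as strings.
scores = cosine([resume_emb], jd_emb)
print(scores)
```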
README.md CHANGED
@@ -1,12 +1,26 @@
- ---
- title: Ats Resume
- emoji: 🚀
- colorFrom: indigo
- colorTo: yellow
- sdk: streamlit
- sdk_version: 1.33.0
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Resume Screening App
+ This app is built for employers screening candidates against a particular job description. Given a candidate's resume and a job description, it outputs a percentage similarity score.
+
+ App deployed on [Streamlit Community Cloud](https://soumee2000-applicant-tracking-system-application-tqrpm0.streamlit.app/)
+
+ ## Intuition:
+ 1. Get [context-aware BERT embeddings](https://towardsdatascience.com/nlp-extract-contextualized-word-embeddings-from-bert-keras-tf-67ef29f60a7b) or [doc2vec document embeddings](https://cs.stanford.edu/~quocle/paragraph_vector.pdf) for the resume and the job description.
+ 2. The [Hugging Face](https://huggingface.co/sentence-transformers/bert-base-nli-mean-tokens) sentence-transformers model provides the BERT embeddings, along with gensim's doc2vec and nltk.
+ 3. Compute their [cosine similarity](https://developers.google.com/machine-learning/clustering/similarity/measuring-similarity) (a short sketch follows below).
+
+ ## Workflow:
+ <img src = "https://github.com/SOUMEE2000/Applicant_Tracking_System/blob/main/Demo/Workflow.png">
+
+ ## Interface
+ <img src = "https://github.com/SOUMEE2000/Resume_Scanner/blob/main/Demo/Interface.png" height=400>
+ <img src = "https://github.com/SOUMEE2000/Applicant_Tracking_System/blob/main/Demo/Interface_Results.png" height=400 width=800>
+
+ ## Usage
+
+ ```
+ pip install -r requirements.txt
+ ```
+ **Run**: ```streamlit run application.py```
+
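For reference, the "match percentage" from step 3 of the Intuition above is just the cosine similarity between the two embedding vectors, scaled to a percentage. A tiny sketch with made-up 4-dimensional vectors (real embeddings are 768-dimensional for the BERT model and 512-dimensional for the Doc2Vec configuration in `Models.py`):

```
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Made-up low-dimensional embeddings, for illustration only.
resume_vec = np.array([[0.2, 0.1, 0.4, 0.3]])
jd_vec = np.array([[0.3, 0.1, 0.5, 0.1]])

match = cosine_similarity(resume_vec, jd_vec)[0][0] * 100
print(f"Match: {match:.2f}%")
```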
Resume_scanner.py ADDED
@@ -0,0 +1,22 @@
+
+ import streamlit as st
+ from Models import get_HF_embeddings, cosine, get_doc2vec_embeddings
+
+ def compare(resume_texts, JD_text, flag='HuggingFace-BERT'):
+     JD_embeddings = None
+     resume_embeddings = []
+
+     if flag == 'HuggingFace-BERT':
+         if JD_text is not None:
+             JD_embeddings = get_HF_embeddings(JD_text)
+         for resume_text in resume_texts:
+             resume_embeddings.append(get_HF_embeddings(resume_text))
+
+         if JD_embeddings is not None and resume_embeddings:
+             cos_scores = cosine(resume_embeddings, JD_embeddings)
+             return cos_scores
+
+     # Add logic for other flags like 'Doc2Vec' if necessary
+     else:
+         # Handle other cases
+         pass
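The `Doc2Vec` branch in `compare()` is left as a TODO. A hedged sketch of how it could plug in, using `get_doc2vec_embeddings` from `Models.py` (the helper name `compare_doc2vec` is illustrative and not part of this commit):

```
from Models import get_doc2vec_embeddings, cosine

def compare_doc2vec(resume_texts, JD_text):
    # get_doc2vec_embeddings trains a small Doc2Vec model on the JD text and
    # infers a vector for each resume; cosine() turns them into match scores.
    JD_embeddings, resume_embeddings = get_doc2vec_embeddings(JD_text, resume_texts)
    return cosine(resume_embeddings, JD_embeddings)
```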
__pycache__/Models.cpython-37.pyc ADDED
Binary file (2.52 kB).
 
__pycache__/Resume_Scanner.cpython-37.pyc ADDED
Binary file (962 Bytes).
 
application.py ADDED
@@ -0,0 +1,72 @@
+ import sys
+ import streamlit as st
+ import pdfplumber
+ from Resume_scanner import compare
+
+
+ def extract_pdf_data(file_path):
+     data = ""
+     with pdfplumber.open(file_path) as pdf:
+         for page in pdf.pages:
+             text = page.extract_text()
+             if text:
+                 data += text
+     return data
+
+
+ def extract_text_data(file_path):
+     with open(file_path, 'r') as file:
+         data = file.read()
+     return data
+
+
+ # Command-line argument processing:
+ # expects a resume PDF path and a job-description text file path
+ if len(sys.argv) > 1:
+
+     if len(sys.argv) == 3:
+         resume_path = sys.argv[1]
+         jd_path = sys.argv[2]
+
+         resume_data = extract_pdf_data(resume_path)
+         jd_data = extract_text_data(jd_path)
+
+         # cosine() in Models.py prints the match percentage
+         result = compare([resume_data], jd_data, flag='HuggingFace-BERT')
+
+     sys.exit()
+
+ # Sidebar
+ flag = 'HuggingFace-BERT'
+ with st.sidebar:
+     st.markdown('**Which embedding do you want to use**')
+     options = st.selectbox('Which embedding do you want to use',
+                            ['HuggingFace-BERT', 'Doc2Vec'],
+                            label_visibility="collapsed")
+     flag = options
+
+ # Main content
+ tab1, tab2 = st.tabs(["**Home**", "**Results**"])
+
+ # Tab Home
+ with tab1:
+     st.title("Applicant Tracking System")
+     uploaded_files = st.file_uploader(
+         '**Choose your resume.pdf file:** ', type="pdf", accept_multiple_files=True)
+     JD = st.text_area("**Enter the job description:**")
+     comp_pressed = st.button("Compare!")
+     if comp_pressed and uploaded_files:
+         # Streamlit's file_uploader gives file-like objects, not paths;
+         # pdfplumber can open these directly, so extract the text here
+         uploaded_file_paths = [extract_pdf_data(file) for file in uploaded_files]
+         score = compare(uploaded_file_paths, JD, flag)
+
+ # Tab Results
+ with tab2:
+     st.header("Results")
+     my_dict = {}
+     if comp_pressed and uploaded_files:
+         for i in range(len(score)):
+             my_dict[uploaded_files[i].name] = score[i]
+         sorted_dict = dict(sorted(my_dict.items()))
+         for i in sorted_dict.items():
+             with st.expander(str(i[0])):
+                 st.write("Score is: ", i[1])
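Besides the Streamlit UI, `application.py` also has a one-shot command-line mode when exactly two arguments are passed: the resume PDF and a plain-text job description. An illustrative invocation (file names are placeholders; the match percentage is printed by `cosine()` in `Models.py`):

```
python application.py my_resume.pdf job_description.txt
```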
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ streamlit
+ transformers
+ torch
+ pdfplumber
+ nltk
+ gensim
+ scikit-learn