Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -2,8 +2,6 @@ import os
|
|
2 |
import re
|
3 |
import fitz # Importing PyMuPDF for PDF text extraction
|
4 |
import nltk
|
5 |
-
from transformers import BertTokenizer, BertModel
|
6 |
-
import torch
|
7 |
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
|
8 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
9 |
from sklearn.metrics.pairwise import cosine_similarity
|
@@ -14,10 +12,6 @@ import gradio as gr
|
|
14 |
nltk.download('punkt')
|
15 |
nltk.download('stopwords')
|
16 |
|
17 |
-
# Load BERT model and tokenizer
|
18 |
-
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
19 |
-
model = BertModel.from_pretrained('bert-base-uncased')
|
20 |
-
|
21 |
# Function to preprocess text
|
22 |
def preprocess_text(text):
|
23 |
text = re.sub(r'\W+', ' ', text.lower()) # Remove non-alphanumeric characters and lower case
|
@@ -79,25 +73,12 @@ def give_feedback(resume_text, job_description):
|
|
79 |
|
80 |
return feedback
|
81 |
|
82 |
-
# Function to compute BERT embeddings
|
83 |
-
def get_bert_embeddings(text):
|
84 |
-
tokens = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
|
85 |
-
with torch.no_grad():
|
86 |
-
outputs = model(**tokens)
|
87 |
-
return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
|
88 |
-
|
89 |
-
# Function to calculate BERT similarity score
|
90 |
-
def bert_similarity(resume, job_description):
|
91 |
-
resume_embedding = get_bert_embeddings(resume)
|
92 |
-
job_description_embedding = get_bert_embeddings(job_description)
|
93 |
-
cosine_sim = cosine_similarity([resume_embedding], [job_description_embedding])
|
94 |
-
return cosine_sim[0][0]
|
95 |
-
|
96 |
# Function to calculate TF-IDF cosine similarity score
|
97 |
def tfidf_cosine_similarity(resume, jd):
|
98 |
documents = [resume, jd]
|
99 |
vectorizer = TfidfVectorizer()
|
100 |
tfidf_matrix = vectorizer.fit_transform(documents)
|
|
|
101 |
cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
|
102 |
return cosine_sim[0][0]
|
103 |
|
@@ -105,6 +86,7 @@ def tfidf_cosine_similarity(resume, jd):
|
|
105 |
def doc2vec_cosine_similarity(resume, jd, model):
|
106 |
resume_vector = model.infer_vector(resume.split())
|
107 |
jd_vector = model.infer_vector(jd.split())
|
|
|
108 |
cosine_sim = cosine_similarity([resume_vector], [jd_vector])
|
109 |
return cosine_sim[0][0]
|
110 |
|
@@ -127,7 +109,7 @@ def extract_info_from_resumes(resume_files, job_description):
|
|
127 |
|
128 |
documents.append(preprocess_text(job_description))
|
129 |
tagged_docs = [TaggedDocument(doc.split(), [i]) for i, doc in enumerate(documents)]
|
130 |
-
|
131 |
|
132 |
for file in resume_files:
|
133 |
text = extract_text_from_pdf(file.name)
|
@@ -149,15 +131,13 @@ def extract_info_from_resumes(resume_files, job_description):
|
|
149 |
common_keywords = set(jd_keywords).intersection(set(resume_keywords))
|
150 |
keyword_match_score = len(common_keywords) # Count of common keywords as a whole number
|
151 |
tfidf_score = tfidf_cosine_similarity(text, job_description)
|
152 |
-
doc2vec_score = doc2vec_cosine_similarity(preprocessed_text, preprocess_text(job_description),
|
153 |
-
bert_score = bert_similarity(preprocessed_text, preprocess_text(job_description))
|
154 |
|
155 |
data.append({
|
156 |
'Name': name,
|
157 |
'Keyword_Match_Score': keyword_match_score, # Whole number
|
158 |
'TFIDF_Score': tfidf_score,
|
159 |
'Doc2Vec_Score': doc2vec_score,
|
160 |
-
'BERT_Score': bert_score, # Add BERT score
|
161 |
'Years_of_Experience': years_of_experience,
|
162 |
'Feedback': '; '.join(feedback), # Combine feedback into a single string
|
163 |
})
|
@@ -197,4 +177,4 @@ iface = gr.Interface(
|
|
197 |
)
|
198 |
|
199 |
# Launch the Gradio interface
|
200 |
-
iface.launch(
|
|
|
2 |
import re
|
3 |
import fitz # Importing PyMuPDF for PDF text extraction
|
4 |
import nltk
|
|
|
|
|
5 |
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
|
6 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
7 |
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
12 |
nltk.download('punkt')
|
13 |
nltk.download('stopwords')
|
14 |
|
|
|
|
|
|
|
|
|
15 |
# Function to preprocess text
|
16 |
def preprocess_text(text):
|
17 |
text = re.sub(r'\W+', ' ', text.lower()) # Remove non-alphanumeric characters and lower case
|
|
|
73 |
|
74 |
return feedback
|
75 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
# Function to calculate TF-IDF cosine similarity score
|
77 |
def tfidf_cosine_similarity(resume, jd):
    """Return the TF-IDF cosine similarity between a resume and a job description.

    A fresh ``TfidfVectorizer`` is fitted on exactly these two texts, and the
    cosine similarity between the two resulting rows is returned.

    Args:
        resume: Resume text.
        jd: Job-description text.

    Returns:
        The cosine similarity of the two TF-IDF rows, or ``0.0`` when no
        vocabulary can be built from the pair.
    """
    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform([resume, jd])
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when neither
        # text yields a single token, e.g. a PDF with no extractable text
        # paired with a blank description. Nothing is shared in that case,
        # so report zero similarity instead of aborting the whole run.
        return 0.0

    # Row 0 is the resume, row 1 the job description; the result is a 1x1 matrix.
    return cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
|
84 |
|
|
|
86 |
def doc2vec_cosine_similarity(resume, jd, model):
    """Score how close a resume is to a job description in Doc2Vec space.

    Each text is split on whitespace, turned into a vector with
    ``model.infer_vector``, and the two vectors are compared by cosine
    similarity.

    Args:
        resume: Resume text.
        jd: Job-description text.
        model: Object exposing ``infer_vector(tokens)``; in this file the
            caller passes a ``Doc2Vec`` model.

    Returns:
        The single entry of the 1x1 cosine-similarity matrix.
    """
    resume_tokens, jd_tokens = resume.split(), jd.split()
    # Infer the resume vector first, then the job description, as before.
    vectors = [model.infer_vector(tokens) for tokens in (resume_tokens, jd_tokens)]
    similarity_matrix = cosine_similarity(vectors[:1], vectors[1:])
    return similarity_matrix[0][0]
|
92 |
|
|
|
109 |
|
110 |
documents.append(preprocess_text(job_description))
|
111 |
tagged_docs = [TaggedDocument(doc.split(), [i]) for i, doc in enumerate(documents)]
|
112 |
+
model = Doc2Vec(tagged_docs, vector_size=50, window=2, min_count=1, workers=4)
|
113 |
|
114 |
for file in resume_files:
|
115 |
text = extract_text_from_pdf(file.name)
|
|
|
131 |
common_keywords = set(jd_keywords).intersection(set(resume_keywords))
|
132 |
keyword_match_score = len(common_keywords) # Count of common keywords as a whole number
|
133 |
tfidf_score = tfidf_cosine_similarity(text, job_description)
|
134 |
+
doc2vec_score = doc2vec_cosine_similarity(preprocessed_text, preprocess_text(job_description), model)
|
|
|
135 |
|
136 |
data.append({
|
137 |
'Name': name,
|
138 |
'Keyword_Match_Score': keyword_match_score, # Whole number
|
139 |
'TFIDF_Score': tfidf_score,
|
140 |
'Doc2Vec_Score': doc2vec_score,
|
|
|
141 |
'Years_of_Experience': years_of_experience,
|
142 |
'Feedback': '; '.join(feedback), # Combine feedback into a single string
|
143 |
})
|
|
|
177 |
)
|
178 |
|
179 |
# Launch the Gradio interface
|
180 |
+
iface.launch()
|