Prernas19 committed on
Commit
3a52026
·
verified ·
1 Parent(s): d91a2ca

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -25
app.py CHANGED
@@ -2,8 +2,6 @@ import os
2
  import re
3
  import fitz # Importing PyMuPDF for PDF text extraction
4
  import nltk
5
- from transformers import BertTokenizer, BertModel
6
- import torch
7
  from gensim.models.doc2vec import Doc2Vec, TaggedDocument
8
  from sklearn.feature_extraction.text import TfidfVectorizer
9
  from sklearn.metrics.pairwise import cosine_similarity
@@ -14,10 +12,6 @@ import gradio as gr
14
  nltk.download('punkt')
15
  nltk.download('stopwords')
16
 
17
- # Load BERT model and tokenizer
18
- tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
19
- model = BertModel.from_pretrained('bert-base-uncased')
20
-
21
  # Function to preprocess text
22
  def preprocess_text(text):
23
  text = re.sub(r'\W+', ' ', text.lower()) # Remove non-alphanumeric characters and lower case
@@ -79,25 +73,12 @@ def give_feedback(resume_text, job_description):
79
 
80
  return feedback
81
 
82
- # Function to compute BERT embeddings
83
- def get_bert_embeddings(text):
84
- tokens = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
85
- with torch.no_grad():
86
- outputs = model(**tokens)
87
- return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
88
-
89
- # Function to calculate BERT similarity score
90
- def bert_similarity(resume, job_description):
91
- resume_embedding = get_bert_embeddings(resume)
92
- job_description_embedding = get_bert_embeddings(job_description)
93
- cosine_sim = cosine_similarity([resume_embedding], [job_description_embedding])
94
- return cosine_sim[0][0]
95
-
96
  # Function to calculate TF-IDF cosine similarity score
97
  def tfidf_cosine_similarity(resume, jd):
98
  documents = [resume, jd]
99
  vectorizer = TfidfVectorizer()
100
  tfidf_matrix = vectorizer.fit_transform(documents)
 
101
  cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
102
  return cosine_sim[0][0]
103
 
@@ -105,6 +86,7 @@ def tfidf_cosine_similarity(resume, jd):
105
  def doc2vec_cosine_similarity(resume, jd, model):
106
  resume_vector = model.infer_vector(resume.split())
107
  jd_vector = model.infer_vector(jd.split())
 
108
  cosine_sim = cosine_similarity([resume_vector], [jd_vector])
109
  return cosine_sim[0][0]
110
 
@@ -127,7 +109,7 @@ def extract_info_from_resumes(resume_files, job_description):
127
 
128
  documents.append(preprocess_text(job_description))
129
  tagged_docs = [TaggedDocument(doc.split(), [i]) for i, doc in enumerate(documents)]
130
- model_doc2vec = Doc2Vec(tagged_docs, vector_size=50, window=2, min_count=1, workers=4)
131
 
132
  for file in resume_files:
133
  text = extract_text_from_pdf(file.name)
@@ -149,15 +131,13 @@ def extract_info_from_resumes(resume_files, job_description):
149
  common_keywords = set(jd_keywords).intersection(set(resume_keywords))
150
  keyword_match_score = len(common_keywords) # Count of common keywords as a whole number
151
  tfidf_score = tfidf_cosine_similarity(text, job_description)
152
- doc2vec_score = doc2vec_cosine_similarity(preprocessed_text, preprocess_text(job_description), model_doc2vec)
153
- bert_score = bert_similarity(preprocessed_text, preprocess_text(job_description))
154
 
155
  data.append({
156
  'Name': name,
157
  'Keyword_Match_Score': keyword_match_score, # Whole number
158
  'TFIDF_Score': tfidf_score,
159
  'Doc2Vec_Score': doc2vec_score,
160
- 'BERT_Score': bert_score, # Add BERT score
161
  'Years_of_Experience': years_of_experience,
162
  'Feedback': '; '.join(feedback), # Combine feedback into a single string
163
  })
@@ -197,4 +177,4 @@ iface = gr.Interface(
197
  )
198
 
199
  # Launch the Gradio interface
200
- iface.launch(inline = False)
 
2
  import re
3
  import fitz # Importing PyMuPDF for PDF text extraction
4
  import nltk
 
 
5
  from gensim.models.doc2vec import Doc2Vec, TaggedDocument
6
  from sklearn.feature_extraction.text import TfidfVectorizer
7
  from sklearn.metrics.pairwise import cosine_similarity
 
12
  nltk.download('punkt')
13
  nltk.download('stopwords')
14
 
 
 
 
 
15
  # Function to preprocess text
16
  def preprocess_text(text):
17
  text = re.sub(r'\W+', ' ', text.lower()) # Remove non-alphanumeric characters and lower case
 
73
 
74
  return feedback
75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  # Function to calculate TF-IDF cosine similarity score
77
  def tfidf_cosine_similarity(resume, jd):
78
  documents = [resume, jd]
79
  vectorizer = TfidfVectorizer()
80
  tfidf_matrix = vectorizer.fit_transform(documents)
81
+
82
  cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
83
  return cosine_sim[0][0]
84
 
 
86
  def doc2vec_cosine_similarity(resume, jd, model):
87
  resume_vector = model.infer_vector(resume.split())
88
  jd_vector = model.infer_vector(jd.split())
89
+
90
  cosine_sim = cosine_similarity([resume_vector], [jd_vector])
91
  return cosine_sim[0][0]
92
 
 
109
 
110
  documents.append(preprocess_text(job_description))
111
  tagged_docs = [TaggedDocument(doc.split(), [i]) for i, doc in enumerate(documents)]
112
+ model = Doc2Vec(tagged_docs, vector_size=50, window=2, min_count=1, workers=4)
113
 
114
  for file in resume_files:
115
  text = extract_text_from_pdf(file.name)
 
131
  common_keywords = set(jd_keywords).intersection(set(resume_keywords))
132
  keyword_match_score = len(common_keywords) # Count of common keywords as a whole number
133
  tfidf_score = tfidf_cosine_similarity(text, job_description)
134
+ doc2vec_score = doc2vec_cosine_similarity(preprocessed_text, preprocess_text(job_description), model)
 
135
 
136
  data.append({
137
  'Name': name,
138
  'Keyword_Match_Score': keyword_match_score, # Whole number
139
  'TFIDF_Score': tfidf_score,
140
  'Doc2Vec_Score': doc2vec_score,
 
141
  'Years_of_Experience': years_of_experience,
142
  'Feedback': '; '.join(feedback), # Combine feedback into a single string
143
  })
 
177
  )
178
 
179
  # Launch the Gradio interface
180
+ iface.launch()