Spaces:
Runtime error
Runtime error
athulnambiar
committed on
Commit
•
af2aec4
1
Parent(s):
6938be6
Update app.py
Browse files
app.py
CHANGED
@@ -1,7 +1,9 @@
|
|
1 |
import streamlit as st
|
2 |
import pandas as pd
|
|
|
3 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
4 |
-
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
5 |
import re
|
6 |
from PyPDF2 import PdfReader
|
7 |
|
@@ -22,35 +24,44 @@ def clean_text(text):
|
|
22 |
text = re.sub(r'\W', ' ', text)
|
23 |
return text.lower()
|
24 |
|
25 |
-
def
|
26 |
tfidf_vectorizer = TfidfVectorizer()
|
27 |
tfidf_matrix = tfidf_vectorizer.fit_transform(resumes + [keywords])
|
28 |
-
|
29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
|
31 |
st.title("Resume Analyzer")
|
32 |
|
33 |
st.sidebar.subheader("Enter Keywords and Priority")
|
34 |
-
|
35 |
data = pd.DataFrame({
|
36 |
'Keyword': ['']*10,
|
37 |
'Priority': ['']*10
|
38 |
})
|
39 |
-
|
40 |
keywords_df = st.sidebar.data_editor(data, num_rows="dynamic", key="keyword_table")
|
41 |
|
42 |
if not keywords_df['Keyword'].isnull().all():
|
43 |
keywords_combined = " ".join(keywords_df.apply(lambda row: f"{row['Keyword']} " * int(row['Priority']) if row['Priority'].isdigit() else row['Keyword'], axis=1))
|
44 |
-
|
45 |
st.subheader("Upload up to 5 resumes (PDF or Text files)")
|
46 |
uploaded_files = st.file_uploader("Choose Resume Files", accept_multiple_files=True, type=["txt", "pdf"])
|
47 |
-
|
48 |
if len(uploaded_files) > 0 and keywords_combined:
|
49 |
with st.spinner("Analyzing Resumes..."):
|
50 |
resumes = []
|
51 |
for file in uploaded_files:
|
52 |
try:
|
53 |
-
|
54 |
resume_text = extract_text_from_file(file)
|
55 |
clean_resume = clean_text(resume_text)
|
56 |
resumes.append(clean_resume)
|
@@ -59,13 +70,26 @@ if not keywords_df['Keyword'].isnull().all():
|
|
59 |
|
60 |
clean_keywords = clean_text(keywords_combined)
|
61 |
|
62 |
-
|
63 |
|
64 |
st.subheader("Resume Analysis Results")
|
65 |
results_df = pd.DataFrame({
|
66 |
'Resume': [file.name for file in uploaded_files],
|
67 |
-
'Similarity
|
|
|
|
|
68 |
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
st.dataframe(results_df)
|
70 |
else:
|
71 |
-
st.info("Please upload resumes and enter keywords with priority.")
|
|
|
1 |
import streamlit as st
|
2 |
import pandas as pd
|
3 |
+
import numpy as np
|
4 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
5 |
+
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
|
6 |
+
from sklearn.preprocessing import MinMaxScaler
|
7 |
import re
|
8 |
from PyPDF2 import PdfReader
|
9 |
|
|
|
24 |
text = re.sub(r'\W', ' ', text)
|
25 |
return text.lower()
|
26 |
|
27 |
+
def calculate_similarity_metrics(resumes, keywords):
    """Score every resume against the keyword string with three measures.

    Args:
        resumes: List of resume texts, one string per resume.
        keywords: Keyword string that each resume is compared against.

    Returns:
        A tuple ``(cosine_sim, jaccard_sim, euclidean_sim)`` holding one
        score per resume, in the same order as ``resumes``.
        ``cosine_sim`` and ``euclidean_sim`` are flattened arrays computed
        on the TF-IDF vectors; ``jaccard_sim`` is a list of floats computed
        on whitespace-separated token sets.
    """
    tfidf_vectorizer = TfidfVectorizer()
    # The keyword string is appended last, so row -1 of the matrix is the
    # keyword vector and rows [:-1] are the resumes, all fitted together.
    tfidf_matrix = tfidf_vectorizer.fit_transform(resumes + [keywords])
    keyword_vector = tfidf_matrix[-1]
    resume_vectors = tfidf_matrix[:-1]

    cosine_sim = cosine_similarity(keyword_vector, resume_vectors).flatten()

    # Tokenise the keywords once instead of once per resume.
    keyword_tokens = set(keywords.split())

    def jaccard_similarity(resume):
        """Return |intersection| / |union| of keyword and resume tokens."""
        resume_tokens = set(resume.split())
        union = keyword_tokens | resume_tokens
        if not union:
            # Both texts are empty/whitespace-only: no overlap to measure,
            # and dividing by len(union) would raise ZeroDivisionError.
            return 0.0
        return len(keyword_tokens & resume_tokens) / len(union)

    jaccard_sim = [jaccard_similarity(resume) for resume in resumes]

    euclidean_dist = euclidean_distances(keyword_vector, resume_vectors).flatten()
    # Map distance in [0, inf) to similarity in (0, 1]; distance 0 -> 1.
    euclidean_sim = 1 / (1 + euclidean_dist)

    return cosine_sim, jaccard_sim, euclidean_sim
|
44 |
|
45 |
st.title("Resume Analyzer")
|
46 |
|
47 |
st.sidebar.subheader("Enter Keywords and Priority")
|
|
|
48 |
data = pd.DataFrame({
|
49 |
'Keyword': ['']*10,
|
50 |
'Priority': ['']*10
|
51 |
})
|
|
|
52 |
keywords_df = st.sidebar.data_editor(data, num_rows="dynamic", key="keyword_table")
|
53 |
|
54 |
if not keywords_df['Keyword'].isnull().all():
|
55 |
keywords_combined = " ".join(keywords_df.apply(lambda row: f"{row['Keyword']} " * int(row['Priority']) if row['Priority'].isdigit() else row['Keyword'], axis=1))
|
56 |
+
|
57 |
st.subheader("Upload up to 5 resumes (PDF or Text files)")
|
58 |
uploaded_files = st.file_uploader("Choose Resume Files", accept_multiple_files=True, type=["txt", "pdf"])
|
59 |
+
|
60 |
if len(uploaded_files) > 0 and keywords_combined:
|
61 |
with st.spinner("Analyzing Resumes..."):
|
62 |
resumes = []
|
63 |
for file in uploaded_files:
|
64 |
try:
|
|
|
65 |
resume_text = extract_text_from_file(file)
|
66 |
clean_resume = clean_text(resume_text)
|
67 |
resumes.append(clean_resume)
|
|
|
70 |
|
71 |
clean_keywords = clean_text(keywords_combined)
|
72 |
|
73 |
+
cosine_scores, jaccard_scores, euclidean_scores = calculate_similarity_metrics(resumes, clean_keywords)
|
74 |
|
75 |
st.subheader("Resume Analysis Results")
|
76 |
results_df = pd.DataFrame({
|
77 |
'Resume': [file.name for file in uploaded_files],
|
78 |
+
'Cosine Similarity': cosine_scores,
|
79 |
+
'Jaccard Index': jaccard_scores,
|
80 |
+
'Euclidean Similarity': euclidean_scores
|
81 |
})
|
82 |
+
|
83 |
+
scaler = MinMaxScaler()
|
84 |
+
normalized_scores = scaler.fit_transform(results_df[['Cosine Similarity', 'Jaccard Index', 'Euclidean Similarity']])
|
85 |
+
|
86 |
+
overall_scores = np.mean(normalized_scores, axis=1)
|
87 |
+
results_df['Overall Score'] = overall_scores
|
88 |
+
|
89 |
+
results_df['Rank'] = results_df['Overall Score'].rank(ascending=False, method='min').astype(int)
|
90 |
+
|
91 |
+
results_df = results_df.sort_values('Rank')
|
92 |
+
|
93 |
st.dataframe(results_df)
|
94 |
else:
|
95 |
+
st.info("Please upload resumes and enter keywords with priority.")
|