import streamlit as st
import pandas as pd

from functions import *
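# The wildcard import from this app's own `functions` module is assumed to
# provide the helpers used below: job_desc_pdf, resume_pdf, preprocess_text,
# drop_duplicates, add_token_count_column, TextSummarizer, batch_summarize,
# SentenceTransformerEncoder, QdrantInterface, and the QUADRANT_ENDPOINT /
# QUADRANT_API_KEY credentials.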

from qdrant_client.http.models import Distance

backgroundPattern = """
<style>
[data-testid="stAppViewContainer"] {
    background-color: #0E1117;
    opacity: 1;
    background-image: radial-gradient(#282C34 0.75px, #0E1117 0.75px);
    background-size: 15px 15px;
}
</style>
"""

st.markdown(backgroundPattern, unsafe_allow_html=True)

st.write("""
# Resume Screening & Classification
""")

st.header('Input')
jobs_data = job_desc_pdf()
resume_data = resume_pdf()
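# job_desc_pdf() and resume_pdf() are assumed to parse the uploaded PDFs into
# DataFrames with a 'description' column (jobs) and a 'Resume' column
# (resumes), since those are the columns consumed below.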

st.write('Parsed input:')
st.write(jobs_data)
st.write(resume_data)

# setup_nltk_resources()

# # Unzip wordnet
# corpora_path = "/kaggle/working/nltk_data/corpora"
# wordnet_zip = os.path.join(corpora_path, "wordnet.zip")
# unzip_nltk_resource(wordnet_zip, corpora_path)

# Apply preprocessing
jobs_data['processed_description'] = jobs_data['description'].apply(preprocess_text)
jobs_data_cleaned = drop_duplicates(jobs_data, column_name='description')



resume_data['processed_resume'] = resume_data['Resume'].apply(preprocess_text)
resume_data_cleaned = drop_duplicates(resume_data, column_name='Resume')
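# A minimal sketch of what preprocess_text is assumed to do, given the
# commented-out NLTK setup above (hypothetical; the real implementation
# lives in functions.py):
#
#   def preprocess_text(text):
#       text = re.sub(r'[^a-z\s]', ' ', text.lower())   # keep letters only
#       tokens = [t for t in text.split() if t not in stop_words]
#       return ' '.join(lemmatizer.lemmatize(t) for t in tokens)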

st.write("CLEANED")
st.write(jobs_data_cleaned)
st.write(resume_data_cleaned)


jobs_data_cleaned_with_tokens = add_token_count_column(jobs_data_cleaned, column_name='processed_description')
resume_data_cleaned_with_tokens = add_token_count_column(resume_data_cleaned, column_name='processed_resume')




# Dropping unnecessary columns from jobs data
jobs_data_final = jobs_data_cleaned_with_tokens[['processed_description', 'token_count']]

# Dropping unnecessary columns from resume data
resume_data_final = resume_data_cleaned_with_tokens[['processed_resume', 'token_count']]

st.write("CLEANED WITH TOKENS")
st.write(jobs_data_final)
st.write(resume_data_final)

summarizer = TextSummarizer("geekradius/bart-large-cnn-fintetuned-samsum-repo")
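# TextSummarizer is assumed to wrap a Hugging Face summarization pipeline,
# roughly like this hypothetical sketch:
#
#   from transformers import pipeline
#   class TextSummarizer:
#       def __init__(self, model_name):
#           self.pipe = pipeline("summarization", model=model_name)
#       def summarize(self, text):
#           return self.pipe(text, truncation=True)[0]['summary_text']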
st.write("sum")

# Summarize all 'processed_description' in jobs_data_final
jobs_data_summarized = batch_summarize(jobs_data_final, 'processed_description', summarizer, batch_size=10, output_col='summarized_description')

# Summarize all 'processed_resume' in resume_data_final
resume_data_summarized = batch_summarize(resume_data_final, 'processed_resume', summarizer, batch_size=10, output_col='summarized_resume')

st.write("SUMMARISED")
st.write(jobs_data_summarized)
st.write(resume_data_summarized)

# Encode the summaries with a sentence-transformer model
encoder = SentenceTransformerEncoder("all-MiniLM-L6-v2")
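# all-MiniLM-L6-v2 produces 384-dimensional sentence embeddings. encode_column
# is assumed to append a '<column>_encoded' vector column, e.g. (hypothetical):
#
#   def encode_column(self, df, column):
#       df[f'{column}_encoded'] = list(self.model.encode(df[column].tolist()))
#       return df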

# Encoding the summarized job descriptions
jobs_data_summarized_and_encoded = encoder.encode_column(jobs_data_summarized, 'summarized_description')

# Encoding the summarized resumes
resume_data_summarized_and_encoded = encoder.encode_column(resume_data_summarized, 'summarized_resume')

st.write("SUMMARISED AND ENCODED")
st.write(jobs_data_summarized_and_encoded)
st.write(resume_data_summarized_and_encoded)

# Combine the jobs data
jobs_combined = pd.merge(
    jobs_data_final,
    jobs_data_summarized_and_encoded[['summarized_description', 'summarized_description_encoded']],
    left_index=True, right_index=True)

# Combine the resume data
resume_combined = pd.merge(
    resume_data_final,
    resume_data_summarized_and_encoded[['summarized_resume', 'summarized_resume_encoded']],
    left_index=True, right_index=True)

# Reset indices of the combined DataFrames
jobs_combined.reset_index(drop=True, inplace=True)
resume_combined.reset_index(drop=True, inplace=True)


st.write("SUMMARISED AND ENCODED")
st.write(jobs_combined)
st.write(resume_combined)


# QDRANT VECTOR STORE

vector_dimension = encoder.model.get_sentence_embedding_dimension()
qdrant_interface = QdrantInterface(QUADRANT_ENDPOINT, QUADRANT_API_KEY, vector_dimension)
qdrant_interface.create_collection('jobs', Distance.COSINE)
qdrant_interface.create_collection('resumes', Distance.COSINE)
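# QdrantInterface is assumed to wrap qdrant_client roughly as follows
# (hypothetical sketch; the real class lives in functions.py):
#
#   from qdrant_client import QdrantClient
#   from qdrant_client.http.models import VectorParams
#   self.client = QdrantClient(url=endpoint, api_key=api_key)
#   self.client.recreate_collection(
#       collection_name=name,
#       vectors_config=VectorParams(size=self.vector_dimension, distance=distance))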

# Qdrant's HTTP payloads must be JSON-serializable, so convert any numpy
# arrays in the vector column to plain Python lists before uploading.
def ensure_list_format(df, vector_col):
    df[vector_col] = df[vector_col].apply(lambda x: x.tolist() if hasattr(x, 'tolist') else x)
    return df

# Ensure vectors are in the correct format before uploading
jobs_combined = ensure_list_format(jobs_combined, 'summarized_description_encoded')
resume_combined = ensure_list_format(resume_combined, 'summarized_resume_encoded')


st.write("LIST FORMAT")
st.write(jobs_combined)
st.write(resume_combined)


# Use the first job's embedding as the query vector for the matching demo below.
given_job_vector = jobs_combined['summarized_description_encoded'].iloc[0]

# Now upload to Qdrant
qdrant_interface.save_to_qdrant(jobs_combined, 'jobs', 'summarized_description_encoded', ['processed_description', 'token_count', 'summarized_description'])
qdrant_interface.save_to_qdrant(resume_combined, 'resumes', 'summarized_resume_encoded', ['processed_resume', 'token_count', 'summarized_resume'])
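# save_to_qdrant presumably upserts one point per row, storing the listed
# columns as payload. A hypothetical sketch using the real qdrant_client API:
#
#   from qdrant_client.http.models import PointStruct
#   points = [PointStruct(id=i, vector=row[vector_col],
#                         payload={c: row[c] for c in payload_cols})
#             for i, row in df.iterrows()]
#   self.client.upsert(collection_name=collection, points=points)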

# Retrieve specific records by IDs from the 'jobs' collection
specific_jobs_records = qdrant_interface.retrieve_specific_records('jobs', ids=[1])


st.write("SPECIFIC JOB RECS")
st.write(specific_jobs_records)



# Find top 5 matching resumes for the example job
matched_resumes = qdrant_interface.match_jobs_to_resumes(given_job_vector, top_k=5)
for resume, score in matched_resumes:
    st.write(f"Matched Resume: {resume['summarized_resume']}, Score: {score}")