Spaces:
Sleeping
Sleeping
File size: 5,315 Bytes
b0df9c3 8f2afd2 b0df9c3 96d38bc b0df9c3 af5c171 0e2eef4 4b01813 0e2eef4 8981128 0e2eef4 04ff643 0e2eef4 04ff643 0e2eef4 553973a 8981128 4b01813 8981128 4b01813 8981128 4b01813 8981128 4b01813 4f56513 4b01813 8981128 4b01813 6d0f67b 4b01813 8981128 dfd7959 8981128 6d0f67b 8981128 6d0f67b b7aa2fc 96d38bc 6d0f67b 96d38bc 6d0f67b 96d38bc 6d0f67b 8981128 4b01813 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 |
import pandas as pd
import streamlit as st
import pandas as pd
from functions import *
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance, Record, Filter
from random import uniform
# Inject a dark dotted-pattern background behind the Streamlit app container.
_BACKGROUND_CSS = """
<style>
[data-testid="stAppViewContainer"] {
background-color: #0E1117;
opacity: 1;
background-image: radial-gradient(#282C34 0.75px, #0E1117 0.75px);
background-size: 15px 15px;
}
</style>
"""
st.markdown(_BACKGROUND_CSS, unsafe_allow_html=True)

# Page title (markdown heading rendered by st.write).
st.write("""
# Resume Screening & Classification
""")
# --- Input section: load job-description and resume PDFs via the project helpers. ---
st.header('Input')

jobs_data = job_desc_pdf()    # DataFrame of job descriptions (helper from functions)
resume_data = resume_pdf()    # DataFrame of resumes (helper from functions)

# Echo the raw DataFrames for inspection.
st.write('input to df:')
st.write(jobs_data)
st.write(resume_data)
# NOTE: the original Kaggle NLTK bootstrap (setup_nltk_resources + wordnet
# unzip under /kaggle/working) is intentionally disabled in this app.

# --- Cleaning: normalise the raw text columns, then drop duplicate rows. ---
jobs_data['processed_description'] = jobs_data['description'].apply(preprocess_text)
resume_data['processed_resume'] = resume_data['Resume'].apply(preprocess_text)
jobs_data_cleaned = drop_duplicates(jobs_data, column_name='description')
resume_data_cleaned = drop_duplicates(resume_data, column_name='Resume')

st.write("CLEANED")
st.write(jobs_data_cleaned)
st.write(resume_data_cleaned)

# --- Token counts: annotate each row, then keep only the columns the
# downstream pipeline needs. ---
jobs_with_tokens = add_token_count_column(jobs_data_cleaned, column_name='processed_description')
resume_with_tokens = add_token_count_column(resume_data_cleaned, column_name='processed_resume')
jobs_data_final = jobs_with_tokens[['processed_description', 'token_count']]
resume_data_final = resume_with_tokens[['processed_resume', 'token_count']]

st.write("CLEANED WITH TOKENS")
st.write(jobs_data_final)
st.write(resume_data_final)
# --- Summarisation: condense each document with a fine-tuned BART model. ---
# NOTE(review): the "fintetuned" spelling is kept verbatim — it must match the
# model repository id on the Hub; confirm before "fixing" it.
summarizer = TextSummarizer("geekradius/bart-large-cnn-fintetuned-samsum-repo")
st.write("sum")

# Summarise job descriptions and resumes in batches of 10.
jobs_data_summarized = batch_summarize(
    jobs_data_final, 'processed_description', summarizer,
    batch_size=10, output_col='summarized_description')
resume_data_summarized = batch_summarize(
    resume_data_final, 'processed_resume', summarizer,
    batch_size=10, output_col='summarized_resume')

st.write("SUMMARISED")
st.write(jobs_data_summarized)
st.write(resume_data_summarized)
# --- Embedding: encode the summaries into dense sentence vectors. ---
encoder = SentenceTransformerEncoder("all-MiniLM-L6-v2")

# Encode the summarised job descriptions and resumes (adds *_encoded columns).
jobs_data_summarized_and_encoded = encoder.encode_column(jobs_data_summarized, 'summarized_description')
resume_data_summarized_and_encoded = encoder.encode_column(resume_data_summarized, 'summarized_resume')

st.write("SUMMARISED AND ENCODED")
st.write(jobs_data_summarized_and_encoded)
st.write(resume_data_summarized_and_encoded)

# Join the summary/embedding columns back onto the token-count frames,
# aligning row-by-row on the shared index.
jobs_combined = pd.merge(
    jobs_data_final,
    jobs_data_summarized_and_encoded[['summarized_description', 'summarized_description_encoded']],
    left_index=True, right_index=True)
resume_combined = pd.merge(
    resume_data_final,
    resume_data_summarized_and_encoded[['summarized_resume', 'summarized_resume_encoded']],
    left_index=True, right_index=True)

# Renumber rows 0..n-1 so Qdrant point ids are contiguous.
jobs_combined.reset_index(drop=True, inplace=True)
resume_combined.reset_index(drop=True, inplace=True)

# Fix: this header previously repeated "SUMMARISED AND ENCODED" (already shown
# above), but what is displayed here is the merged/combined data.
st.write("COMBINED")
st.write(jobs_combined)
st.write(resume_combined)
# --- Qdrant setup: one cosine-distance collection per document type. ---
# Vector size must match the encoder's embedding dimension.
embedding_dim = encoder.model.get_sentence_embedding_dimension()
qdrant_interface = QdrantInterface(QUADRANT_ENDPOINT, QUADRANT_API_KEY, embedding_dim)
for collection_name in ('jobs', 'resumes'):
    qdrant_interface.create_collection(collection_name, Distance.COSINE)
def ensure_list_format(df, vector_col):
    """Coerce every vector in ``df[vector_col]`` to a plain Python list.

    Qdrant expects JSON-serialisable vectors, so any value exposing a
    ``tolist`` method (e.g. a numpy array) is converted; values that are
    already lists pass through unchanged. The column is rewritten in place
    and the same DataFrame is returned for chaining.
    """
    def _as_list(vec):
        return vec.tolist() if hasattr(vec, 'tolist') else vec

    df[vector_col] = df[vector_col].map(_as_list)
    return df
# Convert the embedding columns to plain lists so they serialise to Qdrant.
jobs_combined = ensure_list_format(jobs_combined, 'summarized_description_encoded')
resume_combined = ensure_list_format(resume_combined, 'summarized_resume_encoded')

st.write("LIST FORMAT")
st.write(jobs_combined)
st.write(resume_combined)

# Query vector: the first job's embedding.
query_vector = jobs_combined['summarized_description_encoded'].iloc[0]

# Upload both collections together with their payload columns.
# NOTE(review): 'processed_title' is not among jobs_combined's columns (only
# processed_description/token_count/summary columns survive the earlier
# selection) — verify save_to_qdrant tolerates a missing payload column.
qdrant_interface.save_to_qdrant(
    jobs_combined, 'jobs', 'summarized_description_encoded',
    ['processed_title', 'processed_description', 'token_count', 'summarized_description'])
qdrant_interface.save_to_qdrant(
    resume_combined, 'resumes', 'summarized_resume_encoded',
    ['processed_resume', 'token_count', 'summarized_resume'])

# Sanity check: fetch job record id 1 back out of the 'jobs' collection.
specific_jobs_records = qdrant_interface.retrieve_specific_records('jobs', ids=[1])
st.write("SPECIFIC JOB RECS")
st.write(specific_jobs_records)

# Rank the top 5 resumes against the example job vector and display them.
matched_resumes = qdrant_interface.match_jobs_to_resumes(query_vector, top_k=5)
for resume, score in matched_resumes:
    st.write(f"Matched Resume: {resume['summarized_resume']}, Score: {score}")
|