import pandas as pd
import streamlit as st
from functions import *
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance, Record, Filter
from random import uniform
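# Helper utilities (job_desc_pdf, resume_pdf, preprocess_text, drop_duplicates,
# add_token_count_column, TextSummarizer, batch_summarize, SentenceTransformerEncoder,
# QdrantInterface, QUADRANT_ENDPOINT, QUADRANT_API_KEY) are expected to be defined
# in the local functions module pulled in by the star import above.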
backgroundPattern = """
<style>
[data-testid="stAppViewContainer"] {
    background-color: #0E1117;
    opacity: 1;
    background-image: radial-gradient(#282C34 0.75px, #0E1117 0.75px);
    background-size: 15px 15px;
}
</style>
"""
st.markdown(backgroundPattern, unsafe_allow_html=True)
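# st.markdown with unsafe_allow_html=True injects the raw CSS above,
# giving the app a dark, dotted background instead of the default theme.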
| st.write(""" | |
| # Resume Screening & Classification | |
| """) | |
| st.header('Input') | |
| jobs_data= job_desc_pdf() | |
| resume_data= resume_pdf() | |
| st.write('input to df:') | |
| st.write(jobs_data) | |
| st.write(resume_data) | |
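# job_desc_pdf() and resume_pdf() come from functions.py; they are assumed to
# collect the uploaded PDF files and return their text as pandas DataFrames
# with a 'description' column (jobs) and a 'Resume' column (resumes).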
# setup_nltk_resources()
# # Unzip wordnet
# corpora_path = "/kaggle/working/nltk_data/corpora"
# wordnet_zip = os.path.join(corpora_path, "wordnet.zip")
# unzip_nltk_resource(wordnet_zip, corpora_path)
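# The commented-out block above is only needed when NLTK data has to be set up
# manually (e.g. on Kaggle). preprocess_text and drop_duplicates below come from
# functions.py; preprocess_text is assumed to clean the raw text (lowercasing,
# stripping punctuation and stopwords), and drop_duplicates to remove duplicate
# rows on the given column.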
# Apply preprocessing
jobs_data['processed_description'] = jobs_data['description'].apply(preprocess_text)
jobs_data_cleaned = drop_duplicates(jobs_data, column_name='description')
resume_data['processed_resume'] = resume_data['Resume'].apply(preprocess_text)
resume_data_cleaned = drop_duplicates(resume_data, column_name='Resume')

st.write("CLEANED")
st.write(jobs_data_cleaned)
st.write(resume_data_cleaned)
jobs_data_cleaned_with_tokens = add_token_count_column(jobs_data_cleaned, column_name='processed_description')
resume_data_cleaned_with_tokens = add_token_count_column(resume_data_cleaned, column_name='processed_resume')

# Drop unnecessary columns from the jobs data
jobs_data_final = jobs_data_cleaned_with_tokens[['processed_description', 'token_count']]
# Drop unnecessary columns from the resume data
resume_data_final = resume_data_cleaned_with_tokens[['processed_resume', 'token_count']]

st.write("CLEANED WITH TOKENS")
st.write(jobs_data_final)
st.write(resume_data_final)
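# TextSummarizer is assumed to wrap a Hugging Face seq2seq summarization pipeline
# around the named BART checkpoint, and batch_summarize to run it over the given
# DataFrame column in batches of `batch_size`, writing the results to `output_col`.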
summarizer = TextSummarizer("geekradius/bart-large-cnn-fintetuned-samsum-repo")
st.write("Summarizing...")

# Summarize all 'processed_description' in jobs_data_final
jobs_data_summarized = batch_summarize(jobs_data_final, 'processed_description', summarizer, batch_size=10, output_col='summarized_description')
# Summarize all 'processed_resume' in resume_data_final
resume_data_summarized = batch_summarize(resume_data_final, 'processed_resume', summarizer, batch_size=10, output_col='summarized_resume')

st.write("SUMMARISED")
st.write(jobs_data_summarized)
st.write(resume_data_summarized)
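# SentenceTransformerEncoder is assumed to wrap the all-MiniLM-L6-v2 sentence
# transformer (384-dimensional embeddings); encode_column is assumed to return the
# DataFrame with an additional '<column>_encoded' vector column.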
# Encode the summaries into dense vectors
encoder = SentenceTransformerEncoder("all-MiniLM-L6-v2")
# Encoding the summarized job descriptions
jobs_data_summarized_and_encoded = encoder.encode_column(jobs_data_summarized, 'summarized_description')
# Encoding the summarized resumes
resume_data_summarized_and_encoded = encoder.encode_column(resume_data_summarized, 'summarized_resume')

st.write("SUMMARISED AND ENCODED")
st.write(jobs_data_summarized_and_encoded)
st.write(resume_data_summarized_and_encoded)
# Combine the jobs data
jobs_combined = pd.merge(
    jobs_data_final,
    jobs_data_summarized_and_encoded[['summarized_description', 'summarized_description_encoded']],
    left_index=True, right_index=True)
# Combine the resume data
resume_combined = pd.merge(
    resume_data_final,
    resume_data_summarized_and_encoded[['summarized_resume', 'summarized_resume_encoded']],
    left_index=True, right_index=True)

# Reset the DataFrame indices
jobs_combined.reset_index(drop=True, inplace=True)
resume_combined.reset_index(drop=True, inplace=True)

st.write("COMBINED")
st.write(jobs_combined)
st.write(resume_combined)
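# QdrantInterface is assumed to be a thin wrapper around QdrantClient that connects
# to the cluster at QUADRANT_ENDPOINT with QUADRANT_API_KEY, (re)creates collections
# with the given vector size and distance metric, and upserts one point per
# DataFrame row with the listed payload columns.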
# QDRANT VECTORIZER
vector_dimension = encoder.model.get_sentence_embedding_dimension()
qdrant_interface = QdrantInterface(QUADRANT_ENDPOINT, QUADRANT_API_KEY, vector_dimension)
qdrant_interface.create_collection('jobs', Distance.COSINE)
qdrant_interface.create_collection('resumes', Distance.COSINE)

# Function to ensure vectors are in list format
def ensure_list_format(df, vector_col):
    df[vector_col] = df[vector_col].apply(lambda x: x.tolist() if hasattr(x, 'tolist') else x)
    return df

# Ensure vectors are in the correct format before uploading
jobs_combined = ensure_list_format(jobs_combined, 'summarized_description_encoded')
resume_combined = ensure_list_format(resume_combined, 'summarized_resume_encoded')

st.write("LIST FORMAT")
st.write(jobs_combined)
st.write(resume_combined)
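# Embeddings produced as numpy arrays are converted to plain Python lists above
# so they serialize cleanly when the points are uploaded to Qdrant.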
# Keep one job vector aside to query with later
given_job_vector = jobs_combined['summarized_description_encoded'].iloc[0]

# Now upload to Qdrant (payload columns must exist in the combined DataFrames)
qdrant_interface.save_to_qdrant(jobs_combined, 'jobs', 'summarized_description_encoded', ['processed_description', 'token_count', 'summarized_description'])
qdrant_interface.save_to_qdrant(resume_combined, 'resumes', 'summarized_resume_encoded', ['processed_resume', 'token_count', 'summarized_resume'])

# Retrieve specific records by ID from the 'jobs' collection
specific_jobs_records = qdrant_interface.retrieve_specific_records('jobs', ids=[1])
st.write("SPECIFIC JOB RECS")
st.write(specific_jobs_records)
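# match_jobs_to_resumes is assumed to run a similarity search of the job vector
# against the 'resumes' collection and return (payload, score) pairs; with
# Distance.COSINE, higher scores mean closer matches.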
# Find the top 5 matching resumes for the example job
matched_resumes = qdrant_interface.match_jobs_to_resumes(given_job_vector, top_k=5)
for resume, score in matched_resumes:
    st.write(f"Matched Resume: {resume['summarized_resume']}, Score: {score}")