import pandas as pd
import streamlit as st
from functions import *
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance, Record, Filter
from random import uniform
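# Helper utilities (job_desc_pdf, resume_pdf, preprocess_text, drop_duplicates,
# add_token_count_column, TextSummarizer, batch_summarize, SentenceTransformerEncoder,
# QdrantInterface, QUADRANT_ENDPOINT, QUADRANT_API_KEY) are expected to be defined
# in the local functions module pulled in by the star import above.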
backgroundPattern = """
<style>
[data-testid="stAppViewContainer"] {
    background-color: #0E1117;
    opacity: 1;
    background-image: radial-gradient(#282C34 0.75px, #0E1117 0.75px);
    background-size: 15px 15px;
}
</style>
"""
st.markdown(backgroundPattern, unsafe_allow_html=True)
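# st.markdown with unsafe_allow_html=True injects the raw CSS above,
# giving the app a dark, dotted background instead of the default theme.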
| st.write(""" | |
| # Resume Screening & Classification | |
| """) | |
| st.header('Input') | |
| jobs_data= job_desc_pdf() | |
| resume_data= resume_pdf() | |
| st.write('input to df:') | |
| st.write(jobs_data) | |
| st.write(resume_data) | |
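# job_desc_pdf() and resume_pdf() come from functions.py; they are assumed to
# collect the uploaded PDF files and return their text as pandas DataFrames
# with a 'description' column (jobs) and a 'Resume' column (resumes).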
# setup_nltk_resources()
# # Unzip wordnet
# corpora_path = "/kaggle/working/nltk_data/corpora"
# wordnet_zip = os.path.join(corpora_path, "wordnet.zip")
# unzip_nltk_resource(wordnet_zip, corpora_path)
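# The commented-out block above is only needed when NLTK data has to be set up
# manually (e.g. on Kaggle). preprocess_text and drop_duplicates below come from
# functions.py; preprocess_text is assumed to clean the raw text (lowercasing,
# stripping punctuation and stopwords), and drop_duplicates to remove duplicate
# rows on the given column.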
# Apply preprocessing
jobs_data['processed_description'] = jobs_data['description'].apply(preprocess_text)
jobs_data_cleaned = drop_duplicates(jobs_data, column_name='description')
resume_data['processed_resume'] = resume_data['Resume'].apply(preprocess_text)
resume_data_cleaned = drop_duplicates(resume_data, column_name='Resume')

st.write("CLEANED")
st.write(jobs_data_cleaned)
st.write(resume_data_cleaned)
jobs_data_cleaned_with_tokens = add_token_count_column(jobs_data_cleaned, column_name='processed_description')
resume_data_cleaned_with_tokens = add_token_count_column(resume_data_cleaned, column_name='processed_resume')

# Drop unnecessary columns from the jobs data
jobs_data_final = jobs_data_cleaned_with_tokens[['processed_description', 'token_count']]
# Drop unnecessary columns from the resume data
resume_data_final = resume_data_cleaned_with_tokens[['processed_resume', 'token_count']]

st.write("CLEANED WITH TOKENS")
st.write(jobs_data_final)
st.write(resume_data_final)
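# TextSummarizer is assumed to wrap a Hugging Face seq2seq summarization pipeline
# around the named BART checkpoint, and batch_summarize to run it over the given
# DataFrame column in batches of `batch_size`, writing the results to `output_col`.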
summarizer = TextSummarizer("geekradius/bart-large-cnn-fintetuned-samsum-repo")
st.write("Summarizing...")

# Summarize all 'processed_description' in jobs_data_final
jobs_data_summarized = batch_summarize(jobs_data_final, 'processed_description', summarizer, batch_size=10, output_col='summarized_description')
# Summarize all 'processed_resume' in resume_data_final
resume_data_summarized = batch_summarize(resume_data_final, 'processed_resume', summarizer, batch_size=10, output_col='summarized_resume')

st.write("SUMMARISED")
st.write(jobs_data_summarized)
st.write(resume_data_summarized)
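# SentenceTransformerEncoder is assumed to wrap the all-MiniLM-L6-v2 sentence
# transformer (384-dimensional embeddings); encode_column is assumed to return the
# DataFrame with an additional '<column>_encoded' vector column.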
# Encode the summaries into dense vectors
encoder = SentenceTransformerEncoder("all-MiniLM-L6-v2")
# Encoding the summarized job descriptions
jobs_data_summarized_and_encoded = encoder.encode_column(jobs_data_summarized, 'summarized_description')
# Encoding the summarized resumes
resume_data_summarized_and_encoded = encoder.encode_column(resume_data_summarized, 'summarized_resume')

st.write("SUMMARISED AND ENCODED")
st.write(jobs_data_summarized_and_encoded)
st.write(resume_data_summarized_and_encoded)
# Combine the jobs data
jobs_combined = pd.merge(
    jobs_data_final,
    jobs_data_summarized_and_encoded[['summarized_description', 'summarized_description_encoded']],
    left_index=True, right_index=True)
# Combine the resume data
resume_combined = pd.merge(
    resume_data_final,
    resume_data_summarized_and_encoded[['summarized_resume', 'summarized_resume_encoded']],
    left_index=True, right_index=True)

# Reset the DataFrame indices
jobs_combined.reset_index(drop=True, inplace=True)
resume_combined.reset_index(drop=True, inplace=True)

st.write("COMBINED")
st.write(jobs_combined)
st.write(resume_combined)
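# QdrantInterface is assumed to be a thin wrapper around QdrantClient that connects
# to the cluster at QUADRANT_ENDPOINT with QUADRANT_API_KEY, (re)creates collections
# with the given vector size and distance metric, and upserts one point per
# DataFrame row with the listed payload columns.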
# QDRANT VECTORIZER
vector_dimension = encoder.model.get_sentence_embedding_dimension()
qdrant_interface = QdrantInterface(QUADRANT_ENDPOINT, QUADRANT_API_KEY, vector_dimension)
qdrant_interface.create_collection('jobs', Distance.COSINE)
qdrant_interface.create_collection('resumes', Distance.COSINE)

# Function to ensure vectors are in list format
def ensure_list_format(df, vector_col):
    df[vector_col] = df[vector_col].apply(lambda x: x.tolist() if hasattr(x, 'tolist') else x)
    return df

# Ensure vectors are in the correct format before uploading
jobs_combined = ensure_list_format(jobs_combined, 'summarized_description_encoded')
resume_combined = ensure_list_format(resume_combined, 'summarized_resume_encoded')

st.write("LIST FORMAT")
st.write(jobs_combined)
st.write(resume_combined)
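# Embeddings produced as numpy arrays are converted to plain Python lists above
# so they serialize cleanly when the points are uploaded to Qdrant.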
# Keep one job vector aside to query with later
given_job_vector = jobs_combined['summarized_description_encoded'].iloc[0]

# Now upload to Qdrant (payload columns must exist in the combined DataFrames)
qdrant_interface.save_to_qdrant(jobs_combined, 'jobs', 'summarized_description_encoded', ['processed_description', 'token_count', 'summarized_description'])
qdrant_interface.save_to_qdrant(resume_combined, 'resumes', 'summarized_resume_encoded', ['processed_resume', 'token_count', 'summarized_resume'])

# Retrieve specific records by ID from the 'jobs' collection
specific_jobs_records = qdrant_interface.retrieve_specific_records('jobs', ids=[1])
st.write("SPECIFIC JOB RECS")
st.write(specific_jobs_records)
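# match_jobs_to_resumes is assumed to run a similarity search of the job vector
# against the 'resumes' collection and return (payload, score) pairs; with
# Distance.COSINE, higher scores mean closer matches.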
# Find the top 5 matching resumes for the example job
matched_resumes = qdrant_interface.match_jobs_to_resumes(given_job_vector, top_k=5)
for resume, score in matched_resumes:
    st.write(f"Matched Resume: {resume['summarized_resume']}, Score: {score}")