import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import re
import zipfile
from random import uniform

import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import torch
from tqdm import tqdm
from transformers import BartForConditionalGeneration, BartTokenizer
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance, Record, Filter
import PyPDF2
import streamlit as st

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')


def setup_nltk_resources():
    """
    Sets up the custom NLTK data path and downloads necessary resources.

    Downloads 'wordnet' for lemmatization, 'stopwords' for stopword removal,
    and 'punkt' for sentence tokenization.
    """
    nltk_data_path = "/kaggle/working/nltk_data"
    nltk.data.path.append(nltk_data_path)
    nltk.download('wordnet', download_dir=nltk_data_path)
    nltk.download('stopwords', download_dir=nltk_data_path)
    nltk.download('punkt', download_dir=nltk_data_path)


def unzip_nltk_resource(zip_path, extract_to):
    """
    Unzips an NLTK resource file to a specified directory.

    Args:
        zip_path (str): The path to the zipped NLTK resource file.
        extract_to (str): The directory where the contents of the zip file will be extracted.
    """
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)


def preprocess_text(text):
    """
    Preprocesses a given text string for NLP tasks. This includes cleaning the text,
    tokenizing, removing stopwords, and lemmatizing the words.

    Args:
        text (str): The text string to preprocess.

    Returns:
        str: The preprocessed text.
    """
    if not text:
        return ""
    text = re.sub(r'[\r\n\t]+', ' ', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    lemmatized_text = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    return ' '.join(lemmatized_text)


def drop_duplicates(df, column_name):
    """
    Drops duplicates based on a specified column from the DataFrame.

    Args:
        df (pd.DataFrame): The DataFrame from which to remove duplicates.
        column_name (str): The name of the column based on which duplicates will be identified.

    Returns:
        pd.DataFrame: DataFrame with duplicates removed based on the specified column.
    """
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in DataFrame")
    original_size = df.shape[0]
    df_cleaned = df.drop_duplicates(subset=[column_name])
    new_size = df_cleaned.shape[0]
    print(f"Dropped {original_size - new_size} duplicates from '{column_name}'. "
          f"New dataset size: {new_size}")
    return df_cleaned


def add_token_count_column(df, column_name):
    """
    Adds a new column to the DataFrame with the token count for each entry in the
    specified column. This function creates a copy of the DataFrame to avoid
    'SettingWithCopyWarning'.

    Args:
        df (pd.DataFrame): The DataFrame to process.
        column_name (str): The name of the column for which to count tokens.

    Returns:
        pd.DataFrame: DataFrame with an additional column 'token_count'.
    """
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in DataFrame")
    # Creating a copy of the DataFrame to avoid modifying a slice
    df_copy = df.copy()
    # Tokenize each entry in the specified column and count the number of tokens
    df_copy['token_count'] = df_copy[column_name].apply(
        lambda x: len(word_tokenize(x)) if pd.notnull(x) else 0
    )
    return df_copy
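
# --- Usage sketch (illustrative only, not called anywhere in this script) ---
# Shows how the preprocessing helpers above fit together. The DataFrame contents and the
# "Resume" column name are assumptions for the example, not part of the original pipeline.
def _example_preprocess_resumes():
    sample = pd.DataFrame({"Resume": [
        "Senior Python developer,\n5 years of experience with NLP.",
        "Senior Python developer,\n5 years of experience with NLP.",  # duplicate on purpose
    ]})
    sample["Resume"] = sample["Resume"].apply(preprocess_text)
    sample = drop_duplicates(sample, "Resume")          # drops the duplicated row
    sample = add_token_count_column(sample, "Resume")   # adds a 'token_count' column
    print(sample.head())
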
""" if column_name not in df.columns: raise ValueError(f"Column '{column_name}' not found in DataFrame") # Creating a copy of the DataFrame to avoid modifying a slice df_copy = df.copy() # Tokenize each entry in the specified column and count the number of tokens df_copy['token_count'] = df_copy[column_name].apply(lambda x: len(word_tokenize(x)) if pd.notnull(x) else 0) return df_copy class TextSummarizer: """ A text summarization class that uses a fine-tuned BART model to summarize text. Attributes: device (str): Device to run the model on, either 'cuda' or 'cpu'. model (BartForConditionalGeneration): The loaded BART model. tokenizer (BartTokenizer): The tokenizer for the BART model. """ def __init__(self, model_name): """ Initializes the TextSummarizer with a specified BART model. Args: model_name (str): The name or path of the fine-tuned BART model. """ self.device = "cuda" if torch.cuda.is_available() else "cpu" self.model = BartForConditionalGeneration.from_pretrained(model_name).to(self.device) self.tokenizer = BartTokenizer.from_pretrained(model_name) def summarize(self, text, max_input_length=1024, max_output_length=150, min_output_length=40): """ Summarizes the given text using the fine-tuned BART model. Args: text (str): The text to be summarized. max_input_length (int): The maximum length of the input text in tokens. max_output_length (int): The maximum length of the summary text in tokens. min_output_length (int): The minimum length of the summary text in tokens. Returns: str: The summarized text. """ inputs = self.tokenizer([text], max_length=max_input_length, return_tensors='pt', truncation=True) summary_ids = self.model.generate( inputs['input_ids'].to(self.device), max_length=max_output_length, min_length=min_output_length, length_penalty=2.0, num_beams=4, early_stopping=True ) return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True) def batch_summarize(df, text_col, summarizer, batch_size=10, output_col=None): """ Summarizes text in batches. Args: df (pd.DataFrame): The DataFrame containing text to summarize. text_col (str): The column in the DataFrame with text to summarize. summarizer: The summarizer object or function. batch_size (int): The size of each batch for summarization. output_col (str, optional): The name of the output column for summarized text. If None, defaults to text_col. Returns: pd.DataFrame: DataFrame with summarized text in the specified output column. """ summarized_texts = [] # Use the text_col as output_col if not specified if output_col is None: output_col = text_col # Iterate through the DataFrame in batches for start_idx in tqdm(range(0, len(df), batch_size), desc="Summarizing"): end_idx = start_idx + batch_size batch = df[text_col][start_idx:end_idx] # Summarize each batch summarized_batch = [summarizer.summarize(text) for text in batch] summarized_texts.extend(summarized_batch) # Create a new DataFrame with the summarized text return pd.DataFrame({output_col: summarized_texts}) class SentenceTransformerEncoder: """ A class to handle sentence encoding using Sentence Transformers, directly working with pandas DataFrames. This class encodes text data in a specified DataFrame column into vector representations. Attributes: model (SentenceTransformer): The Sentence Transformer model used for encoding. """ def __init__(self, model_name='all-MiniLM-L6-v2'): """ Initializes the SentenceTransformerEncoder with a specified Sentence Transformer model. Args: model_name (str): The name of the Sentence Transformer model. 
""" self.model = SentenceTransformer(model_name) def encode_column(self, df, column, batch_size=32, encoded_column_suffix='_encoded'): """ Encodes a specific column in a DataFrame and adds a new column with encoded vectors. Args: df (pd.DataFrame): The DataFrame containing the texts to encode. column (str): The name of the column to encode. batch_size (int): The size of each batch for processing. encoded_column_suffix (str): Suffix for the new column containing encoded vectors. Returns: pd.DataFrame: The original DataFrame with an additional column containing encoded vectors. Raises: ValueError: If the specified column is not found in the DataFrame. """ if column not in df.columns: raise ValueError(f"Column '{column}' not found in DataFrame") # Encoding the text data in batches encoded_vectors = [] for start_idx in range(0, len(df), batch_size): end_idx = min(start_idx + batch_size, len(df)) batch_texts = df[column][start_idx:end_idx].tolist() batch_encoded = self.model.encode(batch_texts, show_progress_bar=True) encoded_vectors.extend(batch_encoded) # Adding the encoded vectors as a new column in the DataFrame df[column + encoded_column_suffix] = encoded_vectors return df class QdrantInterface: """ A class for interfacing with the Qdrant vector database. Attributes: client (QdrantClient): Client instance for interacting with Qdrant. vector_dimension (int): Dimension of the vectors used in the collection. """ """ A class for interfacing with the Qdrant vector database. ... """ def __init__(self, url, api_key, vector_dimension): """ Initializes the QdrantInterface with the specified Qdrant URL, API key, and vector dimension. Args: url (str): Full URL of the Qdrant server. api_key (str): API key for Qdrant. vector_dimension (int): Dimension of vectors to be stored in Qdrant. """ self.client = QdrantClient(url=url, api_key=api_key) self.vector_dimension = vector_dimension def create_collection(self, collection_name, distance_metric=Distance.COSINE): """ Creates or recreates a collection in Qdrant. Args: collection_name (str): Name of the collection. distance_metric (Distance): Distance metric for vector comparisons. """ self.client.recreate_collection( collection_name=collection_name, vectors_config=VectorParams(size=self.vector_dimension, distance=distance_metric) ) def save_to_qdrant(self, df, collection_name, vector_col, payload_cols, batch_size=100): """ Saves a DataFrame to Qdrant in batches. Args: df (pd.DataFrame): DataFrame containing data to save. collection_name (str): Name of the collection in Qdrant. vector_col (str): Name of the column containing vectors. payload_cols (list[str]): List of column names to include as payload. batch_size (int): Number of records to process in each batch. """ for start_idx in range(0, len(df), batch_size): end_idx = min(start_idx + batch_size, len(df)) batch = df.iloc[start_idx:end_idx] records = [] for idx, row in batch.iterrows(): # Debug print print(f"Index: {idx}, Vector Type: {type(row[vector_col])}, First 10 Elements: {row[vector_col][:10]}") record = Record( id=idx, vector=row[vector_col], payload={col: row[col] for col in payload_cols} ) records.append(record) self.client.upload_records(collection_name=collection_name, records=records) def retrieve_specific_records(self, collection_name, ids): """ Retrieves specific records by their IDs from a Qdrant collection. Args: collection_name (str): The name of the collection. ids (list): List of record IDs to retrieve. Returns: List of specific records from the collection. 
""" return self.client.retrieve(collection_name=collection_name, ids=ids) def view_sample_records(self, collection_name, vector_dimension, limit=10): """ Retrieves a sample of records from a Qdrant collection using a dummy search. Args: collection_name (str): The name of the collection. vector_dimension (int): Dimension of vectors in the collection. limit (int): The number of records to retrieve. Returns: List of sample records from the collection. """ # Generate a random vector random_vector = [uniform(-1, 1) for _ in range(vector_dimension)] # Perform a dummy search return self.client.search( collection_name=collection_name, query_vector=random_vector, limit=limit ) def match_resumes_to_jobs(self, resume_vector, top_k=10): """ Matches a given resume vector to job postings. Args: resume_vector (list): The vector representation of a resume. top_k (int): Number of top similar matches to return. Returns: List of matched job postings with similarity scores. """ hits = self.client.search( collection_name="jobs", query_vector=resume_vector, limit=top_k, with_payload=True ) return [(hit.payload, hit.score) for hit in hits] def match_jobs_to_resumes(self, job_vector, top_k=10): """ Matches a given job vector to resumes. Args: job_vector (list): The vector representation of a job posting. top_k (int): Number of top similar matches to return. Returns: List of tuples containing matched resumes and their similarity scores. """ hits = self.client.search( collection_name="resumes", query_vector=job_vector, limit=top_k, with_payload=True ) return [(hit.payload, hit.score) for hit in hits] def extract_text_from_pdf(file): """ Extract text from a PDF file using PyPDF2 library. """ text = "" try: pdf_reader = PyPDF2.PdfReader(file) num_pages = len(pdf_reader.pages) for page_num in range(num_pages): page = pdf_reader.pages[page_num] text += page.extract_text() except Exception as e: st.error(f"Error extracting text from PDF: {e}") return text def resume_pdf(): st.title("UPLOAD RESUMES") # Allow user to upload multiple PDF files uploaded_files = st.file_uploader("Upload PDF files", accept_multiple_files=True, type="pdf") if uploaded_files: st.write("## Extracted Text from PDFs") df_rows = [] # Iterate over uploaded PDF files for idx, uploaded_file in enumerate(uploaded_files): text = extract_text_from_pdf(uploaded_file) # Add text to DataFrame df_rows.append({"File Name": f"File_{idx+1}", "Resume": text}) # # Display extracted text # st.write(f"### File {idx+1}") # st.write(text) # # Iterate over uploaded PDF files # for uploaded_file in uploaded_files: # text = extract_text_from_pdf(uploaded_file) # # Add text to DataFrame # df_rows.append({"File Name": uploaded_file.name, "Text": text.decode("utf-8")}) # # Display extracted text # st.write(f"### {uploaded_file.name}") # st.write(text) # Create DataFrame df = pd.DataFrame(df_rows) return df # # Display DataFrame # st.write("## Combined Data in DataFrame") # st.write(df) def job_desc_pdf(): st.title("UPLOAD JOB DESCRIPTION") uploaded_file = st.file_uploader("Upload PDF file", type="pdf") if uploaded_file: st.write("## Extracted Text from PDFs") text = extract_text_from_pdf(uploaded_file) df_rows = [] df_rows.append({"File Name": "Job_Desc", "description": text}) # # Iterate over uploaded PDF files # for uploaded_file in uploaded_files: # text = extract_text_from_pdf(uploaded_file) # # Add text to DataFrame # # # Display extracted text # st.write(f"### {uploaded_file.name}") # st.write(text) # Create DataFrame df = pd.DataFrame(df_rows) # # Display DataFrame 
# st.write("## Combined Data in DataFrame") # st.write(df) return df