"""functions.py: utility functions for the RESUME_RANKER app."""
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import nltk
import zipfile
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from transformers import BartForConditionalGeneration, BartTokenizer
import torch
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance, Record, Filter
from random import uniform
import PyPDF2
import streamlit as st
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
def setup_nltk_resources():
"""
Sets up the custom NLTK data path and downloads necessary resources.
Downloads 'wordnet' for lemmatization, 'stopwords' for stopwords removal,
and 'punkt' for sentence tokenization.
"""
nltk_data_path = "/kaggle/working/nltk_data"
nltk.data.path.append(nltk_data_path)
nltk.download('wordnet', download_dir=nltk_data_path)
nltk.download('stopwords', download_dir=nltk_data_path)
nltk.download('punkt', download_dir=nltk_data_path)
def unzip_nltk_resource(zip_path, extract_to):
"""
Unzips an NLTK resource file to a specified directory.
Args:
zip_path (str): The path to the zipped NLTK resource file.
extract_to (str): The directory where the contents of the zip file will be extracted.
"""
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
zip_ref.extractall(extract_to)
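# Illustrative usage sketch (not called in this module). NLTK ships some corpora
# as zip archives; the exact wordnet.zip path below is an assumption based on the
# download directory used in setup_nltk_resources().
def _example_setup_nltk():
    setup_nltk_resources()
    unzip_nltk_resource("/kaggle/working/nltk_data/corpora/wordnet.zip",
                        "/kaggle/working/nltk_data/corpora/")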
def preprocess_text(text):
"""
Preprocesses a given text string for NLP tasks. This includes cleaning the text,
tokenizing, removing stopwords, and lemmatizing the words.
Args:
text (str): The text string to preprocess.
Returns:
str: The preprocessed text.
"""
if not text:
return ""
text = re.sub(r'[\r\n\t]+', ' ', text)
text = re.sub(r'[^a-zA-Z\s]', '', text)
text = text.lower()
tokens = word_tokenize(text)
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word not in stop_words]
lemmatizer = WordNetLemmatizer()
lemmatized_text = [lemmatizer.lemmatize(word) for word in filtered_tokens]
return ' '.join(lemmatized_text)
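# Illustrative usage sketch (not called in this module); the sample sentence is a
# placeholder, not project data.
def _example_preprocess_text():
    raw = "Senior Python developer with 5+ years of NLP experience."
    # Cleaning, tokenization, stopword removal and lemmatization give roughly:
    # "senior python developer year nlp experience"
    return preprocess_text(raw)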
def drop_duplicates(df, column_name):
"""
Drops duplicates based on a specified column from the DataFrame.
Args:
df (pd.DataFrame): The DataFrame from which to remove duplicates.
column_name (str): The name of the column based on which duplicates will be identified.
Returns:
pd.DataFrame: DataFrame with duplicates removed based on the specified column.
"""
if column_name not in df.columns:
raise ValueError(f"Column '{column_name}' not found in DataFrame")
original_size = df.shape[0]
df_cleaned = df.drop_duplicates(subset=[column_name])
new_size = df_cleaned.shape[0]
print(f"Dropped {original_size - new_size} duplicates from '{column_name}'. New dataset size: {new_size}")
return df_cleaned
def add_token_count_column(df, column_name):
"""
Adds a new column to the DataFrame with the token count for each entry in the specified column.
This function creates a copy of the DataFrame to avoid 'SettingWithCopyWarning'.
Args:
df (pd.DataFrame): The DataFrame to process.
column_name (str): The name of the column for which to count tokens.
Returns:
pd.DataFrame: DataFrame with an additional column 'token_count'.
"""
if column_name not in df.columns:
raise ValueError(f"Column '{column_name}' not found in DataFrame")
# Creating a copy of the DataFrame to avoid modifying a slice
df_copy = df.copy()
# Tokenize each entry in the specified column and count the number of tokens
df_copy['token_count'] = df_copy[column_name].apply(lambda x: len(word_tokenize(x)) if pd.notnull(x) else 0)
return df_copy
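# Illustrative usage sketch for the two DataFrame helpers above (not called in
# this module); the toy DataFrame is a placeholder.
def _example_dataframe_helpers():
    toy = pd.DataFrame({"Resume": ["python developer", "python developer", "data analyst"]})
    deduped = drop_duplicates(toy, "Resume")          # keeps the 2 unique rows
    return add_token_count_column(deduped, "Resume")  # adds a 'token_count' column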
class TextSummarizer:
"""
A text summarization class that uses a fine-tuned BART model to summarize text.
Attributes:
device (str): Device to run the model on, either 'cuda' or 'cpu'.
model (BartForConditionalGeneration): The loaded BART model.
tokenizer (BartTokenizer): The tokenizer for the BART model.
"""
def __init__(self, model_name):
"""
Initializes the TextSummarizer with a specified BART model.
Args:
model_name (str): The name or path of the fine-tuned BART model.
"""
self.device = "cuda" if torch.cuda.is_available() else "cpu"
self.model = BartForConditionalGeneration.from_pretrained(model_name).to(self.device)
self.tokenizer = BartTokenizer.from_pretrained(model_name)
def summarize(self, text, max_input_length=1024, max_output_length=150, min_output_length=40):
"""
Summarizes the given text using the fine-tuned BART model.
Args:
text (str): The text to be summarized.
max_input_length (int): The maximum length of the input text in tokens.
max_output_length (int): The maximum length of the summary text in tokens.
min_output_length (int): The minimum length of the summary text in tokens.
Returns:
str: The summarized text.
"""
inputs = self.tokenizer([text], max_length=max_input_length, return_tensors='pt', truncation=True)
summary_ids = self.model.generate(
inputs['input_ids'].to(self.device),
max_length=max_output_length,
min_length=min_output_length,
length_penalty=2.0,
num_beams=4,
early_stopping=True
)
return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
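# Illustrative usage sketch (not called in this module). "facebook/bart-large-cnn"
# is a stand-in; the fine-tuned checkpoint actually used by the project is not
# named here.
def _example_text_summarizer():
    summarizer = TextSummarizer("facebook/bart-large-cnn")  # downloads the model on first use
    return summarizer.summarize("Long resume or job description text goes here ...")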
def batch_summarize(df, text_col, summarizer, batch_size=10, output_col=None):
"""
Summarizes text in batches.
Args:
df (pd.DataFrame): The DataFrame containing text to summarize.
text_col (str): The column in the DataFrame with text to summarize.
summarizer: The summarizer object or function.
batch_size (int): The size of each batch for summarization.
output_col (str, optional): The name of the output column for summarized text.
If None, defaults to text_col.
Returns:
        pd.DataFrame: A new single-column DataFrame with the summarized text in the specified output column.
"""
summarized_texts = []
# Use the text_col as output_col if not specified
if output_col is None:
output_col = text_col
# Iterate through the DataFrame in batches
for start_idx in tqdm(range(0, len(df), batch_size), desc="Summarizing"):
end_idx = start_idx + batch_size
batch = df[text_col][start_idx:end_idx]
# Summarize each batch
summarized_batch = [summarizer.summarize(text) for text in batch]
summarized_texts.extend(summarized_batch)
# Create a new DataFrame with the summarized text
return pd.DataFrame({output_col: summarized_texts})
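# Illustrative usage sketch (not called in this module); assumes a DataFrame with
# a 'Resume' column and a TextSummarizer as in the example above.
def _example_batch_summarize(df, summarizer):
    # Returns a new single-column DataFrame of summaries, processed batch_size rows at a time.
    return batch_summarize(df, text_col="Resume", summarizer=summarizer,
                           batch_size=10, output_col="Resume_summary")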
class SentenceTransformerEncoder:
"""
A class to handle sentence encoding using Sentence Transformers, directly working with pandas DataFrames.
This class encodes text data in a specified DataFrame column into vector representations.
Attributes:
model (SentenceTransformer): The Sentence Transformer model used for encoding.
"""
def __init__(self, model_name='all-MiniLM-L6-v2'):
"""
Initializes the SentenceTransformerEncoder with a specified Sentence Transformer model.
Args:
model_name (str): The name of the Sentence Transformer model.
"""
self.model = SentenceTransformer(model_name)
def encode_column(self, df, column, batch_size=32, encoded_column_suffix='_encoded'):
"""
Encodes a specific column in a DataFrame and adds a new column with encoded vectors.
Args:
df (pd.DataFrame): The DataFrame containing the texts to encode.
column (str): The name of the column to encode.
batch_size (int): The size of each batch for processing.
encoded_column_suffix (str): Suffix for the new column containing encoded vectors.
Returns:
pd.DataFrame: The original DataFrame with an additional column containing encoded vectors.
Raises:
ValueError: If the specified column is not found in the DataFrame.
"""
if column not in df.columns:
raise ValueError(f"Column '{column}' not found in DataFrame")
# Encoding the text data in batches
encoded_vectors = []
for start_idx in range(0, len(df), batch_size):
end_idx = min(start_idx + batch_size, len(df))
batch_texts = df[column][start_idx:end_idx].tolist()
batch_encoded = self.model.encode(batch_texts, show_progress_bar=True)
encoded_vectors.extend(batch_encoded)
# Adding the encoded vectors as a new column in the DataFrame
df[column + encoded_column_suffix] = encoded_vectors
return df
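# Illustrative usage sketch (not called in this module); 'all-MiniLM-L6-v2' is the
# class default and produces 384-dimensional vectors, which the Qdrant example
# further below assumes as well.
def _example_encode_column(df):
    encoder = SentenceTransformerEncoder(model_name='all-MiniLM-L6-v2')
    # Adds a 'Resume_encoded' column of embedding vectors next to 'Resume'.
    return encoder.encode_column(df, column="Resume", batch_size=32)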
class QdrantInterface:
"""
A class for interfacing with the Qdrant vector database.
Attributes:
client (QdrantClient): Client instance for interacting with Qdrant.
vector_dimension (int): Dimension of the vectors used in the collection.
"""
"""
A class for interfacing with the Qdrant vector database.
...
"""
def __init__(self, url, api_key, vector_dimension):
"""
Initializes the QdrantInterface with the specified Qdrant URL, API key, and vector dimension.
Args:
url (str): Full URL of the Qdrant server.
api_key (str): API key for Qdrant.
vector_dimension (int): Dimension of vectors to be stored in Qdrant.
"""
self.client = QdrantClient(url=url, api_key=api_key)
self.vector_dimension = vector_dimension
def create_collection(self, collection_name, distance_metric=Distance.COSINE):
"""
Creates or recreates a collection in Qdrant.
Args:
collection_name (str): Name of the collection.
distance_metric (Distance): Distance metric for vector comparisons.
"""
self.client.recreate_collection(
collection_name=collection_name,
vectors_config=VectorParams(size=self.vector_dimension, distance=distance_metric)
)
def save_to_qdrant(self, df, collection_name, vector_col, payload_cols, batch_size=100):
"""
Saves a DataFrame to Qdrant in batches.
Args:
df (pd.DataFrame): DataFrame containing data to save.
collection_name (str): Name of the collection in Qdrant.
vector_col (str): Name of the column containing vectors.
payload_cols (list[str]): List of column names to include as payload.
batch_size (int): Number of records to process in each batch.
"""
for start_idx in range(0, len(df), batch_size):
end_idx = min(start_idx + batch_size, len(df))
batch = df.iloc[start_idx:end_idx]
records = []
for idx, row in batch.iterrows():
# Debug print
print(f"Index: {idx}, Vector Type: {type(row[vector_col])}, First 10 Elements: {row[vector_col][:10]}")
                record = Record(
                    id=idx,
                    # Convert to a plain list of floats in case the embedding column holds numpy arrays
                    vector=[float(v) for v in row[vector_col]],
                    payload={col: row[col] for col in payload_cols}
                )
records.append(record)
self.client.upload_records(collection_name=collection_name, records=records)
def retrieve_specific_records(self, collection_name, ids):
"""
Retrieves specific records by their IDs from a Qdrant collection.
Args:
collection_name (str): The name of the collection.
ids (list): List of record IDs to retrieve.
Returns:
List of specific records from the collection.
"""
return self.client.retrieve(collection_name=collection_name, ids=ids)
def view_sample_records(self, collection_name, vector_dimension, limit=10):
"""
Retrieves a sample of records from a Qdrant collection using a dummy search.
Args:
collection_name (str): The name of the collection.
vector_dimension (int): Dimension of vectors in the collection.
limit (int): The number of records to retrieve.
Returns:
List of sample records from the collection.
"""
# Generate a random vector
random_vector = [uniform(-1, 1) for _ in range(vector_dimension)]
# Perform a dummy search
return self.client.search(
collection_name=collection_name,
query_vector=random_vector,
limit=limit
)
def match_resumes_to_jobs(self, resume_vector, top_k=10):
"""
Matches a given resume vector to job postings.
Args:
resume_vector (list): The vector representation of a resume.
top_k (int): Number of top similar matches to return.
Returns:
List of matched job postings with similarity scores.
"""
hits = self.client.search(
collection_name="jobs",
query_vector=resume_vector,
limit=top_k,
with_payload=True
)
return [(hit.payload, hit.score) for hit in hits]
def match_jobs_to_resumes(self, job_vector, top_k=10):
"""
Matches a given job vector to resumes.
Args:
job_vector (list): The vector representation of a job posting.
top_k (int): Number of top similar matches to return.
Returns:
List of tuples containing matched resumes and their similarity scores.
"""
hits = self.client.search(
collection_name="resumes",
query_vector=job_vector,
limit=top_k,
with_payload=True
)
return [(hit.payload, hit.score) for hit in hits]
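# Illustrative end-to-end usage sketch for QdrantInterface (not called in this
# module). The URL and API key are placeholders; the "resumes" collection name and
# the 384-dimensional vectors match the matching methods and encoder defaults above.
def _example_qdrant_roundtrip(resumes_df, job_vector):
    qdrant = QdrantInterface(url="https://YOUR-CLUSTER.qdrant.io",
                             api_key="YOUR_API_KEY",
                             vector_dimension=384)
    qdrant.create_collection("resumes")
    qdrant.save_to_qdrant(resumes_df, collection_name="resumes",
                          vector_col="Resume_encoded", payload_cols=["Resume"])
    # Rank the stored resumes against one encoded job description.
    return qdrant.match_jobs_to_resumes(job_vector, top_k=10)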
def extract_text_from_pdf(file):
    """
    Extracts text from a PDF file using the PyPDF2 library.
    Args:
        file: A binary file-like object (e.g. a Streamlit upload) containing the PDF.
    Returns:
        str: The concatenated text of all pages, or an empty string if extraction fails.
    """
    text = ""
    try:
        pdf_reader = PyPDF2.PdfReader(file)
        for page in pdf_reader.pages:
            # Guard against pages that yield no extractable text
            text += page.extract_text() or ""
except Exception as e:
st.error(f"Error extracting text from PDF: {e}")
return text
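# Illustrative usage sketch (not called in this module); the function accepts any
# binary file handle, so a local path works as well as a Streamlit upload.
def _example_extract_text_from_pdf(path="sample_resume.pdf"):
    with open(path, "rb") as pdf_file:
        return extract_text_from_pdf(pdf_file)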
def resume_pdf():
    """
    Streamlit helper: lets the user upload one or more resume PDFs and returns a
    DataFrame with one row per file ('File Name', 'Resume').
    Returns:
        pd.DataFrame or None: Extracted resume texts, or None if nothing was uploaded.
    """
    st.title("UPLOAD RESUMES")
    # Allow the user to upload multiple PDF files
    uploaded_files = st.file_uploader("Upload PDF files", accept_multiple_files=True, type="pdf")
    if uploaded_files:
        st.write("## Extracted Text from PDFs")
        df_rows = []
        # Extract the text of each uploaded PDF into one row
        for idx, uploaded_file in enumerate(uploaded_files):
            text = extract_text_from_pdf(uploaded_file)
            df_rows.append({"File Name": f"File_{idx+1}", "Resume": text})
        return pd.DataFrame(df_rows)
    return None
def job_desc_pdf():
    """
    Streamlit helper: lets the user upload a single job-description PDF and returns
    a one-row DataFrame ('File Name', 'description').
    Returns:
        pd.DataFrame or None: Extracted job description, or None if nothing was uploaded.
    """
    st.title("UPLOAD JOB DESCRIPTION")
    uploaded_file = st.file_uploader("Upload PDF file", type="pdf")
    if uploaded_file:
        st.write("## Extracted Text from PDF")
        text = extract_text_from_pdf(uploaded_file)
        df_rows = [{"File Name": "Job_Desc", "description": text}]
        return pd.DataFrame(df_rows)
    return None
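# Illustrative end-to-end sketch tying the helpers together (not called in this
# module). The Qdrant URL/API key are placeholders and the overall flow is an
# assumption about how the app wires these functions up.
def _example_pipeline():
    resumes_df = resume_pdf()       # Streamlit upload -> 'File Name' / 'Resume'
    job_df = job_desc_pdf()         # Streamlit upload -> 'File Name' / 'description'
    if resumes_df is None or job_df is None:
        return None
    resumes_df["Resume"] = resumes_df["Resume"].apply(preprocess_text)
    job_df["description"] = job_df["description"].apply(preprocess_text)
    encoder = SentenceTransformerEncoder()
    resumes_df = encoder.encode_column(resumes_df, "Resume")      # -> 'Resume_encoded'
    job_df = encoder.encode_column(job_df, "description")         # -> 'description_encoded'
    qdrant = QdrantInterface("https://YOUR-CLUSTER.qdrant.io", "YOUR_API_KEY", 384)
    qdrant.create_collection("resumes")
    qdrant.save_to_qdrant(resumes_df, "resumes", "Resume_encoded", ["File Name", "Resume"])
    job_vec = [float(v) for v in job_df["description_encoded"].iloc[0]]
    return qdrant.match_jobs_to_resumes(job_vec, top_k=5)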