Spaces:

seanpedrickcase
/

topic_modelling

Running

App Files Files Community

topic_modelling / funcs /embeddings.py

seanpedrickcase

Debugged reference to random_seed in vectorisation and reference to torch in representation_model.py

8216d8c 6 months ago

raw

history blame contribute delete

5.04 kB

	import time
	import numpy as np
	import os
	import spaces
	from torch import cuda, backends, version
	from sentence_transformers import SentenceTransformer
	from sklearn.pipeline import make_pipeline
	from sklearn.decomposition import TruncatedSVD
	from sklearn.feature_extraction.text import TfidfVectorizer

	# Detect whether torch can use CUDA. The result sets the module-level defaults
	# `torch_device` ("gpu"/"cpu") and `high_quality_mode` ("Yes"/"No").
	# If you want to disable cuda for testing purposes, uncomment the next line:
	#os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

	print("Is CUDA enabled? ", cuda.is_available())
	# `backends.cudnn.enabled` is a switch controlling whether cuDNN may be used,
	# not a probe for hardware, so it cannot answer this question. Report whether
	# torch can actually see at least one CUDA device instead.
	print("Is a CUDA device available on this computer?", cuda.device_count() > 0)
	if cuda.is_available():
	    torch_device = "gpu"
	    print("Cuda version installed is: ", version.cuda)
	    high_quality_mode = "Yes"
	    # Dump the driver/GPU status to the console for diagnostics.
	    os.system("nvidia-smi")
	else:
	    torch_device = "cpu"
	    high_quality_mode = "No"


	@spaces.GPU
	def make_or_load_embeddings(docs: list, file_list: list, embeddings_out: np.ndarray, embeddings_super_compress: str, high_quality_mode_opt: str, embeddings_name:str="mixedbread-ai/mxbai-embed-xsmall-v1", random_seed:int=42) -> tuple:
	    """
	    Create or load embeddings for the given documents.

	    An embedding model is always built first (it is returned alongside the
	    embeddings). The embeddings themselves are then taken from the first
	    source that applies: the pre-loaded `embeddings_out` array, a file in
	    `file_list` whose name contains "embedding", or a fresh run of the model
	    over `docs`.

	    Args:
	        docs (list): List of documents to embed.
	        file_list (list): List of file names to check for existing embeddings.
	            The first name containing "embedding" (case-insensitive) is loaded
	            with `np.load` and its 'arr_0' entry is used.
	        embeddings_out (np.ndarray): Previously computed embeddings. If it is
	            None or empty, embeddings are loaded or generated.
	        embeddings_super_compress (str): Option to super compress embeddings ("Yes" or "No").
	            When "Yes", newly generated embeddings are rounded to 3 decimals
	            and multiplied by 100.
	        high_quality_mode_opt (str): Option for high quality mode ("Yes" or "No").
	            "Yes" uses a SentenceTransformer model; any other value uses a
	            TF-IDF + TruncatedSVD pipeline.
	        embeddings_name (str): Model name given to SentenceTransformer when
	            none of the local model folders can be loaded.
	        random_seed (int): Random seed for vectorisation (TruncatedSVD).

	    Returns:
	        tuple: `(embeddings_out, embedding_model)` - the generated or loaded
	        embeddings and the model object that was built.
	    """

	    use_dense_model = high_quality_mode_opt == "Yes"

	    if use_dense_model:
	        # Define a list of possible local locations to search for the model
	        local_embeddings_locations = [
	            "model/embed/", # Potential local location
	            "/model/embed/", # Potential location in Docker container
	            "/home/user/app/model/embed/", # This is inside a Docker container
	        ]

	        # Attempt to load the model from each local location
	        for location in local_embeddings_locations:
	            try:
	                embedding_model = SentenceTransformer(location)
	            except Exception as e:
	                # Best effort: a missing or broken local copy is expected, try the next one
	                print(f"Failed to load model from {location}: {e}")
	                continue
	            print(f"Found local model installation at: {location}")
	            break # Exit the loop if the model is found
	        else:
	            # The loop completed without finding the model in any local location.
	            # Announce the download before it starts rather than after it finishes.
	            print("Could not find local model installation. Downloading from Huggingface")
	            embedding_model = SentenceTransformer(embeddings_name)
	    else:
	        embedding_model = make_pipeline(
	            TfidfVectorizer(),
	            TruncatedSVD(100, random_state=random_seed),
	        )

	    # Embeddings were passed in already: nothing to load or compute
	    if embeddings_out is not None and np.size(embeddings_out) > 0:
	        print("Found pre-loaded embeddings.")
	        return embeddings_out, embedding_model

	    print("Embeddings not found. Loading or generating new ones.")

	    embeddings_file_names = [name for name in (file_list or []) if "embedding" in name.lower()]

	    if embeddings_file_names:
	        embeddings_file_name = embeddings_file_names[0]
	        print("Loading embeddings from file.")

	        # Close the archive once the array has been read out of it
	        with np.load(embeddings_file_name) as archive:
	            embeddings_out = archive['arr_0']

	        # If embedding files have 'super_compress' in the title, they have been multiplied by 100 before save.
	        # Out-of-place division so that integer-typed stored arrays do not raise.
	        if "compress" in embeddings_file_name:
	            embeddings_out = embeddings_out / 100

	        return embeddings_out, embedding_model

	    tic = time.perf_counter()
	    print("Starting to embed documents.")

	    if use_dense_model:
	        print("Creating dense embeddings based on transformers model")

	        # Convert model to half precision (fp16)
	        embedding_model.half()
	        embeddings_out = embedding_model.encode(sentences=docs, show_progress_bar=True, batch_size=32)
	    else:
	        # Mirrors the model choice above: every mode other than "Yes" built the
	        # TF-IDF pipeline, so every such mode must also run it.
	        print("Creating simplified 'sparse' embeddings based on TfIDF")

	        # Fit the pipeline to the text data
	        embedding_model.fit(docs)

	        # Transform text data to embeddings
	        embeddings_out = embedding_model.transform(docs)

	    toc = time.perf_counter()
	    time_out = f"The embedding took {toc - tic:0.1f} seconds"
	    print(time_out)

	    # If the user has chosen to go with super compressed embedding files to save disk space
	    # NOTE(review): applied to freshly generated embeddings only - confirm this
	    # matches the intended behaviour for embeddings loaded from file.
	    if embeddings_super_compress == "Yes":
	        embeddings_out = np.round(embeddings_out, 3)
	        embeddings_out *= 100

	    return embeddings_out, embedding_model