# topic_modelling/funcs/representation_model.py
import os
from bertopic.representation import LlamaCPP
from llama_cpp import Llama
from pydantic import BaseModel
import torch.cuda
from huggingface_hub import hf_hub_download
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, BaseRepresentation
from funcs.embeddings import torch_device
from funcs.prompts import phi3_prompt, phi3_start
chosen_prompt = phi3_prompt #open_hermes_prompt # stablelm_prompt
chosen_start_tag = phi3_start #open_hermes_start # stablelm_start
random_seed = 42
# n_gpu_layers is set to -1 (offload all layers) when a CUDA device is detected, otherwise 0 (CPU only)
print("torch device for representation functions:", torch_device)
if torch_device == "gpu":
low_resource_mode = "No"
n_gpu_layers = -1 # i.e. all
else: # torch_device = "cpu"
low_resource_mode = "Yes"
n_gpu_layers = 0
#print("Running on device:", torch_device)
n_threads = torch.get_num_threads()
print("CPU n_threads:", n_threads)
# Default Model parameters
temperature: float = 0.1
top_k: int = 3
top_p: float = 1
repeat_penalty: float = 1.1
last_n_tokens_size: int = 128
max_tokens: int = 500
seed: int = random_seed
reset: bool = True
stream: bool = False
n_threads: int = n_threads
n_batch: int = 256
n_ctx: int = 8192  # Set to 8192 (rather than 4096) to avoid exceeding the context window
sample: bool = True
trust_remote_code: bool = True
class LLamacppInitConfigGpu(BaseModel):
last_n_tokens_size: int
seed: int
n_threads: int
n_batch: int
n_ctx: int
n_gpu_layers: int
temperature: float
top_k: int
top_p: float
repeat_penalty: float
max_tokens: int
reset: bool
stream: bool
stop: str
    trust_remote_code: bool

    def update_gpu(self, new_value: int):
        """Update the number of model layers offloaded to the GPU."""
        self.n_gpu_layers = new_value
llm_config = LLamacppInitConfigGpu(last_n_tokens_size=last_n_tokens_size,
seed=seed,
n_threads=n_threads,
n_batch=n_batch,
n_ctx=n_ctx,
n_gpu_layers=n_gpu_layers,
temperature=temperature,
top_k=top_k,
top_p=top_p,
repeat_penalty=repeat_penalty,
max_tokens=max_tokens,
reset=reset,
stream=stream,
stop=chosen_start_tag,
trust_remote_code=trust_remote_code)
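# Illustrative sketch (not executed here): the config object is mutable, so GPU offloading can be
# adjusted after creation via the update_gpu helper defined above, e.g. to force CPU-only inference.
# llm_config.update_gpu(0)   # keep every layer on the CPU
# llm_config.update_gpu(-1)  # offload all layers to the GPU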
## Create representation model parameters ##
keybert = KeyBERTInspired(random_state=random_seed)
mmr = MaximalMarginalRelevance(diversity=0.5)
base_rep = BaseRepresentation()
# Find model file
def find_model_file(hf_model_name: str, hf_model_file: str, search_folder: str, sub_folder: str) -> str:
"""
Finds the specified model file within the given search folder and subfolder.
Args:
hf_model_name (str): The name of the Hugging Face model.
hf_model_file (str): The specific file name of the model to find.
search_folder (str): The base folder to start the search.
sub_folder (str): The subfolder within the search folder to look into.
Returns:
str: The path to the found model file, or None if the file is not found.
"""
    hf_sub_loc = search_folder + sub_folder
if sub_folder == "/hub/":
hf_model_name_path = hf_sub_loc + 'models--' + hf_model_name.replace("/","--")
else:
hf_model_name_path = hf_sub_loc
def find_file(root_folder, file_name):
for root, dirs, files in os.walk(root_folder):
if file_name in files:
return os.path.join(root, file_name)
return None
    print("Searching for model file", hf_model_file, "in:", hf_model_name_path)
    found_file = find_file(hf_model_name_path, hf_model_file)
return found_file
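# Illustrative usage sketch (not executed here). The repo id, file name and folders below are
# hypothetical placeholders, not values used elsewhere in this module:
# found = find_model_file(hf_model_name="author/model-GGUF",   # hypothetical repo id
#                         hf_model_file="model.Q4_K_M.gguf",   # hypothetical GGUF file name
#                         search_folder="model",
#                         sub_folder="/rep/")
# print(found)  # full path to the model file, or None if it was not found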
def create_representation_model(representation_type: str, llm_config: LLamacppInitConfigGpu, hf_model_name: str, hf_model_file: str, chosen_start_tag: str, low_resource_mode: bool) -> dict:
"""
Creates a representation model based on the specified type and configuration.
Args:
representation_type (str): The type of representation model to create (e.g., "LLM", "KeyBERT").
        llm_config (LLamacppInitConfigGpu): Configuration settings for the LLM model.
hf_model_name (str): The name of the Hugging Face model.
hf_model_file (str): The specific file name of the model to find.
chosen_start_tag (str): The start tag to use for the model.
low_resource_mode (bool): Whether to enable low resource mode.
Returns:
dict: A dictionary containing the created representation model.
"""
if representation_type == "LLM":
print("Generating LLM representation")
# Use llama.cpp to load in model
# Check for HF_HOME environment variable and supply a default value if it's not found (typical location for huggingface models)
base_folder = "model" #"~/.cache/huggingface/hub"
hf_home_value = os.getenv("HF_HOME", base_folder)
# Expand the user symbol '~' to the full home directory path
if "~" in base_folder:
hf_home_value = os.path.expanduser(hf_home_value)
# Check if the directory exists, create it if it doesn't
if not os.path.exists(hf_home_value):
os.makedirs(hf_home_value)
print("Searching base folder for model:", hf_home_value)
found_file = find_model_file(hf_model_name, hf_model_file, hf_home_value, "/rep/")
if found_file:
print(f"Model file found in model folder: {found_file}")
else:
found_file = find_model_file(hf_model_name, hf_model_file, hf_home_value, "/hub/")
if not found_file:
error = "File not found in HF hub directory or in local model file."
print(error, " Downloading model from hub")
found_file = hf_hub_download(repo_id=hf_model_name, filename=hf_model_file)#, local_dir=hf_home_value) # cache_dir
print("Downloaded model from Huggingface Hub to: ", found_file)
print("Loading representation model with", llm_config.n_gpu_layers, "layers allocated to GPU.")
        llm = Llama(model_path=found_file, stop=chosen_start_tag, n_gpu_layers=llm_config.n_gpu_layers, n_ctx=llm_config.n_ctx, seed=llm_config.seed)
        llm_model = LlamaCPP(llm, prompt=chosen_prompt)
# All representation models
representation_model = {
"LLM": llm_model
}
elif representation_type == "KeyBERT":
print("Generating KeyBERT representation")
#representation_model = {"mmr": mmr}
representation_model = {"KeyBERT": keybert}
elif representation_type == "MMR":
print("Generating MMR representation")
representation_model = {"MMR": mmr}
else:
print("Generating default representation type")
representation_model = {"Default":base_rep}
return representation_model
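# Minimal usage sketch, assuming this module is run directly. The "KeyBERT" path is shown because it
# needs no local model file; the repo id and file name arguments below are hypothetical placeholders.
# BERTopic accepts the returned dictionary directly through its representation_model argument.
if __name__ == "__main__":
    from bertopic import BERTopic

    representation_model = create_representation_model(representation_type="KeyBERT",
                                                        llm_config=llm_config,
                                                        hf_model_name="author/model-GGUF",   # hypothetical repo id
                                                        hf_model_file="model.Q4_K_M.gguf",   # hypothetical file name
                                                        chosen_start_tag=chosen_start_tag,
                                                        low_resource_mode=low_resource_mode)

    topic_model = BERTopic(representation_model=representation_model)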