# Provenance: downloaded from a Hugging Face repository ("Upload 3 files",
# commit 5446331, ~10.6 kB). The file-viewer header lines were removed so
# the module is valid Python.
import huggingface_hub
import re
class LlamaManager:
    """Synthesize Python-programming question datasets with Meta-Llama-3.1-70B.

    Every request steers the model to answer as a list wrapped in [L]...[/L]
    tags whose items are wrapped in [A]...[/A] tags; the private helpers below
    build and parse that format.  Generation methods pre-fill the assistant
    turn with an open "[L]" (optionally seeded with items) so the model
    continues the list rather than starting free-form prose.
    """

    # Shared system prompt enforcing the tagged-list output format
    # (was duplicated verbatim at every call site).
    __SYSTEM_PROMPT = (
        "You are a synthetic data generator. You must only answer questions "
        "as a list. Each item of the list should be enclosed in [A] and [/A] "
        "tags. The list should be enclosed in [L] and [/L] tags."
    )

    def __init__(self, llama_token=None, verbose=False):
        """
        Args:
            llama_token: Hugging Face API token with access to the model.
            verbose: When True, print a progress line for each step.
        """
        self.verbose = verbose
        if self.verbose:
            print("LlamaManager::__init__::Initializing LlamaManager")
        self.client = huggingface_hub.InferenceClient(
            "meta-llama/Meta-Llama-3.1-70B-Instruct",
            token=llama_token,
        )
        if self.verbose:
            print("LlamaManager::__init__::Initialized LlamaManager")

    def __get_items_between_tags(self, input_string, tag1, tag2):
        """Return all non-greedy matches between two regex fragments.

        *tag1*/*tag2* are regex-escaped fragments (e.g. r"\\[A\\]");
        DOTALL lets an item span multiple lines.
        """
        return re.findall(tag1 + "(.*?)" + tag2, input_string, re.DOTALL)

    def __wrap_items(self, items):
        """Render an iterable of strings as '[A]item[/A][A]item[/A]...'."""
        return "".join(f"[A]{item}[/A]" for item in items)

    def __parse_tagged_list(self, out, context):
        """Extract the [A]-tagged items from the first [L]...[/L] block.

        Replaces three identical postprocess methods.  Bug fix: the original
        indexed ``re.findall(...)[0]`` *before* checking for emptiness, so a
        reply without [L]...[/L] tags raised IndexError instead of reaching
        the intended ``return []`` branch.

        Args:
            out: Raw model reply text.
            context: Caller name, used only for verbose log lines.

        Returns:
            List of item strings; [] when either tag level is missing.
        """
        blocks = self.__get_items_between_tags(out, r"\[L\]", r"\[/L\]")
        if not blocks or not blocks[0]:
            if self.verbose:
                print(f"LlamaManager::{context}::No content found")
            return []
        items = self.__get_items_between_tags(blocks[0], r"\[A\]", r"\[/A\]")
        if not items:
            if self.verbose:
                print(f"LlamaManager::{context}::No items found")
            return []
        return items

    def __tagged_list_messages(self, user_prompt, assistant_prefill):
        """Standard 3-turn conversation: system rules, user request, and a
        pre-filled assistant turn the model is expected to continue."""
        return [
            {"role": "system", "content": self.__SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
            {"role": "assistant", "content": assistant_prefill},
        ]

    def __chat(self, messages, max_tokens, seed, temperature, top_p,
               frequency_penalty):
        """Single non-streaming chat-completion call; returns the reply text."""
        out = self.client.chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            stream=False,
            seed=seed,
            temperature=temperature,
            top_p=top_p,
            frequency_penalty=frequency_penalty,
        )
        return out.choices[0].message.content

    def auto_generate_questions_categories(
        self,
        count=20,
        available_categories=("Variables",),  # tuple: immutable default
        seed=123,
        temperature=1.0,
        top_p=0.9,
        frequency_penalty=0.0,
    ):
        """Ask the model for *count* basic Python topics.

        Args:
            count: Number of topics requested.
            available_categories: Seed topics pre-filled into the assistant
                turn so the model extends rather than repeats them.
            seed / temperature / top_p / frequency_penalty: Sampling controls
                forwarded to the inference API.

        Returns:
            List of topic strings parsed from the reply ([] on parse failure).
        """
        prefill = f"[L]{self.__wrap_items(available_categories)}"
        if self.verbose:
            print("LlamaManager::auto_generate_questions_categories::Generating questions categories")
        content = self.__chat(
            self.__tagged_list_messages(
                f"Write me {count} basic topics for python programming",
                prefill,
            ),
            1000, seed, temperature, top_p, frequency_penalty,
        )
        categories = self.__parse_tagged_list(content, "auto_generate_questions_categories")
        if self.verbose:
            print("LlamaManager::auto_generate_questions_categories::Generated questions Categories")
        return categories

    def auto_generate_shots_for_category(
        self,
        count,
        category,
        seed=123,
        temperature=1.0,
        top_p=0.9,
        frequency_penalty=0.0,
    ):
        """Generate *count* example questions ("shots") for one topic.

        A one-shot For-Loop exchange teaches the model the exact format; the
        final assistant turn is pre-filled with "[L]" and the closing "[/L]"
        is appended to the reply before parsing.

        Returns:
            List of question strings ([] on parse failure).
        """
        if self.verbose:
            print("LlamaManager::auto_generate_shots_for_category::Generating shots for category")
        messages = [
            {"role": "system", "content": self.__SYSTEM_PROMPT},
            {"role": "user", "content": "Write me 2 programming questions on the topic of For Loop in Python. The question should be of medium and hard difficulty. The question should involve use of just one function"},
            {"role": "assistant", "content": """[L]
- [A]Write a program that takes a positive integer as input and computes the sum of its digits using a for loop.[/A]
- [A]Write a program that generates a spiral matrix of size NxN, where N is always an odd number. Fill the spiral matrix with consecutive prime numbers in a clockwise spiral pattern, starting from the center of the matrix.[/A]
"""},
            {"role": "user", "content": f"Write me {count} programming questions on the topic of {category} in Python. The question should be of medium and hard difficulty. The question should involve use of just one function"},
            {"role": "assistant", "content": "[L]"},
        ]
        content = self.__chat(messages, 1000, seed, temperature, top_p, frequency_penalty)
        shots = self.__parse_tagged_list(content + "[/L]", "auto_generate_shots_for_category")
        if self.verbose:
            print(f"LlamaManager::auto_generate_shots_for_category::Generated {count} shots for {category}")
        return shots

    def auto_generate_questions_from_shots(
        self,
        count,
        category,
        shots,
        seed=123,
        temperature=1.0,
        top_p=0.9,
        frequency_penalty=0.0,
    ):
        """Expand *shots* into at least *count* questions for *category*.

        Repeatedly calls the model, each round re-seeding the assistant
        prefill with everything parsed so far and raising the token budget
        by 500, until *count* questions exist or four consecutive rounds
        yield the same number of questions (no progress).

        Returns:
            The questions from the last round (may be fewer than *count*
            if generation stalled).
        """
        user_prompt = f"Write me {count} python programming questions which uses {category.lower()}"
        prefill = f"[L]{self.__wrap_items(shots)}"
        if self.verbose:
            print("LlamaManager::auto_generate_questions_from_shots::Generating questions from shots")
        round_counts = []
        questions = []
        max_tokens = 1000
        while len(questions) < count:
            content = self.__chat(
                self.__tagged_list_messages(user_prompt, prefill),
                max_tokens, seed, temperature, top_p, frequency_penalty,
            )
            questions = self.__parse_tagged_list(content + "[/L]", "auto_generate_questions_from_shots")
            prefill = f"[L]{self.__wrap_items(questions)}"
            round_counts.append(len(questions))
            max_tokens += 500
            # Stall detection: four consecutive rounds with the same count.
            if len(round_counts) > 3 and len(set(round_counts[-4:])) == 1:
                if self.verbose:
                    print("LlamaManager::auto_generate_questions_from_shots::Generation could not be completed, stopping API calls")
                break
        if self.verbose:
            print("LlamaManager::auto_generate_questions_from_shots::Generated questions from shots")
        return questions
if __name__ == "__main__":
    # Demo driver: runs the full pipeline end-to-end against the live API.
    # NOTE(review): "nope" is a placeholder token — real runs need a valid
    # Hugging Face token with access to Meta-Llama-3.1-70B-Instruct.
    manager = LlamaManager("nope", True)
    topic_list = manager.auto_generate_questions_categories(20)
    example_shots = manager.auto_generate_shots_for_category(2, topic_list[3])
    final_questions = manager.auto_generate_questions_from_shots(
        10, topic_list[3], example_shots, temperature=0.5
    )