Spaces:

macadeliccc
/

polyglot-4x7b-chat

Running on Zero

App Files Files Community

polyglot-4x7b-chat / app.py

macadeliccc

Update app.py

a27b181 verified 5 months ago

raw history blame contribute delete

No virus

4.49 kB

	import spaces
	import gradio as gr
	import torch
	from transformers import AutoModelForCausalLM, AutoTokenizer
	from transformers import StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer
	from threading import Thread

	# Lazy loading the model to meet huggingface stateless GPU requirements

	# Defining a custom stopping criteria class for the model's text generation.
	class StopOnTokens(StoppingCriteria):
	def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
	stop_ids = [50256, 50295] # IDs of tokens where the generation should stop.
	for stop_id in stop_ids:
	if input_ids[0][-1] == stop_id: # Checking if the last generated token is a stop token.
	return True
	return False


	# Function to generate model predictions.
	@spaces.GPU
	def predict(message, history):
	torch.set_default_device("cuda")

	# Loading the tokenizer and model from Hugging Face's model hub.
	tokenizer = AutoTokenizer.from_pretrained(
	"macadeliccc/laser-dolphin-mixtral-2x7b-dpo",
	trust_remote_code=True
	)
	model = AutoModelForCausalLM.from_pretrained(
	"macadeliccc/laser-dolphin-mixtral-2x7b-dpo",
	torch_dtype="auto",
	load_in_4bit=True,
	trust_remote_code=True
	)
	history_transformer_format = history + [[message, ""]]
	stop = StopOnTokens()

	# Formatting the input for the model.
	system_prompt = "<\|im_start\|>system\nYou are Polyglot, a helpful, multilingual, AI assistant.<\|im_end\|>"
	messages = system_prompt + "".join(["".join(["\n<\|im_start\|>user\n" + item[0], "<\|im_end\|>\n<\|im_start\|>assistant\n" + item[1]]) for item in history_transformer_format])
	input_ids = tokenizer([messages], return_tensors="pt").to('cuda')
	streamer = TextIteratorStreamer(tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True)
	generate_kwargs = dict(
	input_ids,
	streamer=streamer,
	max_new_tokens=512,
	do_sample=True,
	top_p=0.95,
	top_k=50,
	temperature=0.7,
	num_beams=1,
	stopping_criteria=StoppingCriteriaList([stop])
	)
	t = Thread(target=model.generate, kwargs=generate_kwargs)
	t.start() # Starting the generation in a separate thread.
	partial_message = ""
	for new_token in streamer:
	partial_message += new_token
	if '<\|im_end\|>' in partial_message: # Breaking the loop if the stop token is generated.
	break
	yield partial_message


	# Setting up the Gradio chat interface.
	gr.ChatInterface(predict,
	description="""
	<center><img src="https://huggingface.co/macadeliccc/laser-polyglot-4x7b/resolve/main/polyglot.png" width="33%"></center>\n\n
	Chat with [macadeliccc/laser-polyglot-4x7b](https://huggingface.co/macadeliccc/laser-dolphin-mixtral-2x7b-dpo), the first multilingual Mixture of Experts model.
	This model (24.2B param) produces content in English, Chinese, and Japanese. If you are interested in adding more languages, check out [macadeliccc/Polyglot-8x7b-v0.1](https://huggingface.co/macadeliccc/Polyglot-8x7b-v0.1) Output is considered experimental.\n\n
	❤️ If you like this work, please follow me on [Hugging Face](https://huggingface.co/macadeliccc) and [LinkedIn](https://www.linkedin.com/in/tim-dolan-python-dev/).
	""",
	examples=[
	'Can you demonstrate how to implement a quicksort algorithm in Python? Please provide clear and efficient code.',
	'你能展示如何用Python实现快速排序算法吗？请提供清晰且高效的代码。',
	'Pythonでクイックソートアルゴリズムを実装する方法を示していただけますか？わかりやすく効率的なコードを提供してください。',
	'Could you write a thoughtful short story that incorporates elements of magic and adventure? Please keep it concise yet engaging.',
	'你能写一个包含魔法和冒险元素的深思短篇故事吗？请保持简洁但引人入胜。',
	'魔法と冒険の要素を取り入れた心に残る短編小説を書いていただけますか？簡潔でありながらも魅力的にお願いします。'
	],
	theme=gr.themes.Soft(primary_hue="purple"),
	).launch()