Spaces:

intuitivo
/

mixtral-8x7b-chat

Paused

App Files Files Community

mixtral-8x7b-chat / app.py

joselobenitezg

Update app.py

57eb400 11 months ago

raw

history blame contribute delete

1.94 kB

	import gradio as gr
	import torch
	from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer
	from threading import Thread
	import subprocess


	# Make sure the `flash_attn` package is importable; install it with pip on
	# the fly when the import fails.
	import sys

	try:
	    __import__("flash_attn")
	    print('El paquete ya está instalado.')
	except ModuleNotFoundError:
	    # Run pip through the interpreter that is executing this script so the
	    # package lands in the same environment. A bare "python" is looked up
	    # on PATH and may be a different interpreter, or missing entirely.
	    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'flash_attn'])
	    print('El paquete ha sido instalado.')

	# Load the chat model (its repository ships custom code, hence
	# trust_remote_code) and place it on the first CUDA device, then load the
	# tokenizer from the same repository.
	model = AutoModelForCausalLM.from_pretrained(
	    "mattshumer/mistral-8x7b-chat",
	    trust_remote_code=True,
	).to('cuda:0')
	tokenizer = AutoTokenizer.from_pretrained("mattshumer/mistral-8x7b-chat")

	class StopOnTokens(StoppingCriteria):
	    """Stopping criterion that fires when the newest token is a stop id."""

	    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
	        """Return True when the last token of the first sequence is 29 or 0.

	        Only ``input_ids[0][-1]`` is inspected; ``scores`` and ``kwargs`` are
	        accepted but not used.
	        """
	        # NOTE(review): the stop ids are hard-coded -- confirm they match the
	        # vocabulary of the tokenizer loaded in this file.
	        newest_token = input_ids[0][-1]
	        return any(newest_token == stop_id for stop_id in (29, 0))

	def predict(message, history):
	    """Stream the bot's reply to ``message`` as a growing string.

	    Args:
	        message: The latest user message.
	        history: List of earlier turns, each indexable as
	            ``[user_text, bot_text]``.

	    Yields:
	        The reply accumulated so far, once for every streamed chunk that is
	        kept.
	    """
	    # Add the new message as a final turn whose bot side is still empty.
	    turns = history + [[message, ""]]

	    # Flatten the conversation into one "\n<human>:...\n<bot>:..." prompt.
	    prompt_parts = []
	    for turn in turns:
	        prompt_parts.append("\n<human>:" + turn[0])
	        prompt_parts.append("\n<bot>:" + turn[1])
	    prompt = "".join(prompt_parts)

	    encoded = tokenizer([prompt], return_tensors="pt").to("cuda")
	    token_stream = TextIteratorStreamer(
	        tokenizer,
	        timeout=10.,
	        skip_prompt=True,
	        skip_special_tokens=True,
	    )

	    # The tokenizer output supplies the base keyword arguments; the sampling
	    # settings, streamer and stopping criterion are layered on top.
	    generation_args = dict(encoded)
	    generation_args.update(
	        streamer=token_stream,
	        max_new_tokens=1024,
	        do_sample=True,
	        top_p=0.95,
	        top_k=1000,
	        temperature=1.0,
	        num_beams=1,
	        stopping_criteria=StoppingCriteriaList([StopOnTokens()]),
	    )

	    # Generate on a background thread so the streamer can be consumed here.
	    worker = Thread(target=model.generate, kwargs=generation_args)
	    worker.start()

	    reply = ""
	    for chunk in token_stream:
	        if chunk == '<':
	            # A chunk that is exactly "<" is dropped and nothing is yielded.
	            continue
	        reply += chunk
	        yield reply


	# Build the chat UI around `predict`, enable request queuing, and start it.
	demo = gr.ChatInterface(predict)
	demo.queue().launch()