Magro-preview

Sleeping

App Files Files Community

Magro-preview / app.py

Sakalti

Update app.py

b26e890 verified 3 months ago

raw

history blame contribute delete

3.58 kB

	import os
	from threading import Thread
	from typing import Iterator
	import queue

	import gradio as gr
	import spaces
	import torch
	from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

	DESCRIPTION = """\
	# magro-7b

	日本語ai
	"""

	MAX_MAX_NEW_TOKENS = 2048
	DEFAULT_MAX_NEW_TOKENS = 1024
	MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

	device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

	model_id = "Sakalti/magro-7B"
	tokenizer = AutoTokenizer.from_pretrained(model_id)
	model = AutoModelForCausalLM.from_pretrained(
	model_id,
	device_map="auto",
	torch_dtype=torch.float16,
	)
	model.config.sliding_window = 4096
	model.eval()

	@spaces.GPU
	def generate(
	message: str,
	chat_history: list[dict],
	max_new_tokens: int = 1024,
	temperature: float = 0.7,
	top_p: float = 0.9,
	top_k: int = 50,
	repetition_penalty: float = 1.2,
	) -> Iterator[str]:
	conversation = chat_history + [{"role": "user", "content": message}]

	input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
	if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
	input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
	gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
	input_ids = input_ids.to(device)

	streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
	generate_kwargs = dict(
	input_ids=input_ids,
	streamer=streamer,
	max_new_tokens=max_new_tokens,
	do_sample=True,
	top_p=top_p,
	top_k=top_k,
	temperature=temperature,
	num_beams=1,
	repetition_penalty=repetition_penalty,
	)
	t = Thread(target=model.generate, kwargs=generate_kwargs)
	t.start()

	outputs = []
	try:
	for text in streamer:
	outputs.append(text)
	yield "".join(outputs)
	except queue.Empty:
	# キューが空になった場合の処理
	gr.Warning("生成プロセスがタイムアウトしました。")
	yield "".join(outputs)

	demo = gr.ChatInterface(
	fn=generate,
	type="messages",
	description=DESCRIPTION,
	css_paths="style.css",
	fill_height=True,
	additional_inputs_accordion=gr.Accordion(label="詳細設定", open=False),
	additional_inputs=[
	gr.Slider(
	label="Max new tokens",
	minimum=1,
	maximum=MAX_MAX_NEW_TOKENS,
	step=1,
	value=DEFAULT_MAX_NEW_TOKENS,
	),
	gr.Slider(
	label="Temperature",
	minimum=0.1,
	maximum=4.0,
	step=0.1,
	value=0.7,
	),
	gr.Slider(
	label="Top-p (nucleus sampling)",
	minimum=0.05,
	maximum=1.0,
	step=0.05,
	value=0.95,
	),
	gr.Slider(
	label="Top-k",
	minimum=1,
	maximum=1000,
	step=1,
	value=50,
	),
	gr.Slider(
	label="Repetition penalty",
	minimum=1.0,
	maximum=2.0,
	step=0.05,
	value=1.2,
	),
	],
	stop_btn=None,
	examples=[
	["こんにちは、自己紹介をしてください。"],
	["マシンラーニングについての詩を書いてください。"],
	["c言語は難しいですか？"],
	],
	cache_examples=False,
	)

	if __name__ == "__main__":
	demo.launch()