Spaces:

kirp
/

tinyllama-chat

Running

App Files Files Community

tinyllama-chat / app.py

kirp

Update app.py

a6dfd28 almost 2 years ago

raw

history blame contribute delete

1.98 kB

	import gradio as gr
	import json

	from huggingface_hub import snapshot_download
	from llama_cpp import Llama

	repo_name = "PY007/TinyLlama-1.1B-Chat-v0.2-GGUF"
	model_name = "ggml-model-q4_0.gguf"

	snapshot_download(repo_id=repo_name, local_dir=".", allow_patterns=model_name)

	model = Llama(
	model_path=model_name,
	n_ctx=1024,
	n_parts=1,
	)


	template = "<\|im_start\|>user\n{}<\|im_end\|>\n<\|im_start\|>assistant\n"

	def generate(
	input=None,
	temperature=0.1,
	top_p=0.75,
	top_k=40,
	max_tokens=512,
	):

	prompt = template.format(input)
	output = ""
	for chunk in model.create_completion(prompt,
	temperature = temperature,
	top_k = top_k,
	top_p = top_p,
	max_tokens = max_tokens,
	stop=["<\|im_end\|>"],
	echo = False,
	stream = True):
	output +=chunk["choices"][0]["text"]
	yield output
	return output

	g = gr.Interface(
	fn=generate,
	inputs=[
	gr.components.Textbox(
	lines=2, label="Prompt", value = "What is Huggingface?"
	),
	gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Temperature"),
	gr.components.Slider(minimum=0, maximum=1, value=1, label="Top p"),
	gr.components.Slider(minimum=0, maximum=100, step=1, value=50, label="Top k"),
	gr.components.Slider(minimum=1, maximum=1024, step=1, value=256, label="Max tokens"),
	],
	outputs=[
	gr.Textbox(
	lines=10,
	label="Output",
	)
	],
	title = "TinyLlama 1.1B Chat GGUF",
	description = """
	original model: [PY007/TinyLlama-1.1B-Chat-v0.2](https://huggingface.co/PY007/TinyLlama-1.1B-Chat-v0.2)
	quantized_model: [kirp/TinyLlama-1.1B-Chat-v0.2-gguf](https://huggingface.co/kirp/TinyLlama-1.1B-Chat-v0.2-gguf)
	"""
	)
	g.queue(concurrency_count=1)
	g.launch()