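# app.py: Gradio demo that streams chat completions from a 4-bit quantized
# TinyLlama-1.1B-Chat model (GGUF) running on llama.cpp.
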
import gradio as gr
from huggingface_hub import snapshot_download
from llama_cpp import Llama
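
# Fetch only the quantized GGUF weights from the Hub into the working
# directory; allow_patterns restricts the snapshot to that single file.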
repo_name = "PY007/TinyLlama-1.1B-Chat-v0.2-GGUF"
model_name = "ggml-model-q4_0.gguf"
snapshot_download(repo_id=repo_name, local_dir=".", allow_patterns=model_name)
model = Llama(
    model_path=model_name,
    n_ctx=1024,  # context window shared by prompt and completion
    n_parts=1,   # model-parts option from older llama.cpp loaders
)
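
# ChatML-style prompt: wrap the user message, then open an assistant turn;
# generation stops once the model emits <|im_end|>.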
template = "<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"

def generate(
    user_input=None,
    temperature=0.1,
    top_p=0.75,
    top_k=40,
    max_tokens=512,
):
    prompt = template.format(user_input)
    output = ""
    # Stream tokens from llama.cpp, yielding the accumulated text so the
    # output Textbox updates incrementally.
    for chunk in model.create_completion(
        prompt,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        max_tokens=max_tokens,
        stop=["<|im_end|>"],  # end of the assistant turn
        echo=False,           # do not echo the prompt back
        stream=True,
    ):
        output += chunk["choices"][0]["text"]
        yield output
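
# Minimal usage sketch (hypothetical, for local testing without the UI):
# the generator yields the growing completion, so the final yielded value
# is the full answer.
#
#   last = ""
#   for last in generate("What is Huggingface?"):
#       pass
#   print(last)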

# Build the web UI: a prompt box plus sampling controls, with a streaming
# text output.
g = gr.Interface(
    fn=generate,
    inputs=[
        gr.components.Textbox(
            lines=2, label="Prompt", value="What is Huggingface?"
        ),
        gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Temperature"),
        gr.components.Slider(minimum=0, maximum=1, value=1, label="Top p"),
        gr.components.Slider(minimum=0, maximum=100, step=1, value=50, label="Top k"),
        gr.components.Slider(minimum=1, maximum=1024, step=1, value=256, label="Max tokens"),
    ],
    outputs=[
        gr.Textbox(
            lines=10,
            label="Output",
        )
    ],
    title="TinyLlama 1.1B Chat GGUF",
    description="""
original model: [PY007/TinyLlama-1.1B-Chat-v0.2](https://huggingface.co/PY007/TinyLlama-1.1B-Chat-v0.2)
quantized model: [kirp/TinyLlama-1.1B-Chat-v0.2-gguf](https://huggingface.co/kirp/TinyLlama-1.1B-Chat-v0.2-gguf)
""",
)
# Queue requests so only one generation runs at a time (concurrency_count
# is the Gradio 3.x argument; newer releases use default_concurrency_limit).
g.queue(concurrency_count=1)
g.launch()