"""Gradio demo that serves a quantized TinyLlama chat model via llama-cpp-python."""

import copy
import os
import random
import sys
import time

import gradio as gr
import requests
from huggingface_hub import snapshot_download
from llama_cpp import Llama

repo_name = "kirp/TinyLlama-1.1B-Chat-v0.2-gguf"
model_name = "ggml-model-q4_k_m.gguf"

# Fetch only the single model file from the repo into the working directory.
snapshot_download(repo_id=repo_name, local_dir=".", allow_patterns=model_name)

model = Llama(
    model_path=model_name,
    n_ctx=2048,
    n_parts=1,
)

# Single-turn chat prompt: one user message followed by an open assistant turn.
template = "<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"


def generate(
    input=None,
    temperature=0.1,
    top_p=0.75,
    top_k=40,
    max_new_tokens=512,
):
    """Return the model's reply to a single user message.

    Args:
        input: The user's message. ``None`` is treated as an empty message.
        temperature: Sampling temperature passed to the model.
        top_p: Nucleus-sampling threshold passed to the model.
        top_k: Top-k sampling cutoff passed to the model.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        The generated assistant text, without the prompt.
    """
    # The signature's default is None; format it as an empty message rather
    # than the literal string "None".
    user_message = "" if input is None else input
    prompt = template.format(user_message)

    completion = model(
        prompt,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        # max_tokens limits generated tokens only, so the prompt's length
        # (least of all its character count) must not be added to it.
        max_tokens=max_new_tokens,
        stop=["<|im_end|>"],
        # Ask for the completion alone; stripping an echoed prompt by
        # splitting on a marker breaks when the user's text or the reply
        # happens to contain that marker.
        echo=False,
    )
    return completion["choices"][0]["text"]


g = gr.Interface(
    fn=generate,
    inputs=[
        gr.components.Textbox(
            lines=2, label="Prompt", placeholder="Tell me about huggingface."
        ),
        gr.components.Slider(minimum=0, maximum=1, value=0.7, label="Temperature"),
        gr.components.Slider(minimum=0, maximum=1, value=0.8, label="Top p"),
        gr.components.Slider(minimum=0, maximum=100, step=1, value=50, label="Top k"),
        gr.components.Slider(
            minimum=1, maximum=2048, step=1, value=512, label="Max tokens"
        ),
    ],
    outputs=[
        gr.Textbox(
            lines=10,
            label="Output",
        )
    ],
    title="TinyLlama 1.1B Chat GGUF",
    description=(
        " original model: [PY007/TinyLlama-1.1B-Chat-v0.2]"
        "(https://huggingface.co/PY007/TinyLlama-1.1B-Chat-v0.2)"
        " quantized_model: [kirp/TinyLlama-1.1B-Chat-v0.2-gguf]"
        "(https://huggingface.co/kirp/TinyLlama-1.1B-Chat-v0.2-gguf) "
    ),
)
g.queue(concurrency_count=2)
g.launch()