File size: 2,434 Bytes
dbe816b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import gradio as gr
from transformers import AutoTokenizer
from quantizer import load_quantized_model
import torch

# Markdown heading rendered at the top of the demo page.
# NOTE(review): the heading says "Llama2" while the blurb below links
# keyfan/Qwen-72B-Chat-2bit — confirm which model this demo is meant to serve.
title = """# 🙋🏻‍♂️Welcome to 🌟Tonic's 2-Bit Llama2 on GPU-Zero! """
# Markdown/HTML blurb rendered under the heading.
# Fixes: the QuIP link now has its closing parenthesis (it previously did not
# render as a link), the stray `</h3>` with no opening tag is gone, and the
# first sentence has its missing verb.
# NOTE(review): the "Duplicate Space" badge points at Tonic/StableMed_Chat —
# confirm that this is the space users should duplicate.
description = """
This model is a 2 bit quantized model using [QuIP for all](https://github.com/chu-tianxiang/QuIP-for-all/). You can try out [keyfan/Qwen-72B-Chat-2bit](https://huggingface.co/keyfan/Qwen-72B-Chat-2bit) below or try it locally by cloning or duplicating this space.  Simply click here: <a style="display:inline-block" href="https://huggingface.co/spaces/Tonic/StableMed_Chat?duplicate=true"><img src="https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=&logoWidth=14" alt="Duplicate Space"></a>
Join us : 🌟TeamTonic🌟 is always making cool demos! Join our active builder's🛠️community on 👻Discord: [Discord](https://discord.gg/GWpVpekp) On 🤗Huggingface: [TeamTonic](https://huggingface.co/TeamTonic) & [MultiTransformer](https://huggingface.co/MultiTransformer) On 🌐Github: [Polytonic](https://github.com/tonic-ai) & contribute to 🌟 [PolyGPT](https://github.com/tonic-ai/polygpt-alpha)
"""
# Checkpoint directory; both the quantized weights and the tokenizer files
# are read from this same location.
quant_dir = "llama-70b_2bit_quip"

# Load the quantized model, then move it onto the GPU as a separate step.
quant_model = load_quantized_model(quant_dir)
quant_model = quant_model.cuda()

# Tokenizer matching the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(quant_dir)

def generate_text(input_text):
    """Generate sampled text from the quantized model for a user prompt.

    Args:
        input_text: Prompt string coming from the UI. ``None``, empty, and
            whitespace-only prompts are treated as "no prompt".

    Returns:
        The first sequence returned by ``quant_model.generate``, decoded to a
        string with special tokens removed, or ``""`` when there is no prompt.
    """
    # Guard clause: with nothing to condition on, skip the GPU round-trip.
    # A tokenizer that adds no BOS token may otherwise hand `generate` an
    # empty tensor.
    if not input_text or not input_text.strip():
        return ""

    input_ids = tokenizer.encode(input_text, return_tensors="pt").cuda()
    # `generate` returns a batch; there is a single prompt, so take row 0.
    output_ids = quant_model.generate(input_ids, do_sample=True)[0]
    # Strip special-token markers so they do not leak into the UI textbox.
    return tokenizer.decode(output_ids, skip_special_tokens=True)

# Build the Gradio UI: heading and blurb, a prompt box with a "Generate"
# button, and a read-only box that displays the generated text.
with gr.Blocks() as demo:
    gr.Markdown(title)
    gr.Markdown(description)
    with gr.Row():
        input_text = gr.Textbox(label="Enter text here", placeholder="Type something...", lines=2)
        submit_button = gr.Button("Generate")
    # `interactive=False` is how Gradio marks a Textbox as read-only;
    # `readonly` is not a Textbox parameter.
    output_text = gr.Textbox(label="Generated Text", interactive=False)

    # Clicking the button runs generation on the prompt and shows the result.
    submit_button.click(
        fn=generate_text,
        inputs=input_text,
        outputs=output_text,
    )

# Start the web server for the demo.
demo.launch()