"""Gradio demo that serves a 2-bit QuIP-quantized causal language model.

The script loads the quantized checkpoint onto the GPU once at start-up,
exposes a single text-in/text-out generation function, and wires it to a
minimal Gradio Blocks interface.
"""

import gradio as gr
import torch
from transformers import AutoTokenizer

from quantizer import load_quantized_model

# Markdown shown at the top of the page.
title = "# 🙋🏻‍♂️Welcome to 🌟Tonic's 2-Bit Llama2 on GPU-Zero! "

# Markdown body shown under the title. Written as adjacent literals so the
# long text stays readable in source while remaining a single string.
# The first link needs its closing parenthesis, otherwise the Markdown
# renderer does not turn it into a link.
description = (
    " this model a 2 bit quantized model using "
    "[QuIP for all](https://github.com/chu-tianxiang/QuIP-for-all/). "
    "You can try out "
    "[keyfan/Qwen-72B-Chat-2bit](https://huggingface.co/keyfan/Qwen-72B-Chat-2bit) "
    "below or try it locally by cloning or duplicating this space. "
    "Simply click here: Duplicate Space "
    "Join us : 🌟TeamTonic🌟 is always making cool demos! "
    "Join our active builder's🛠️community on 👻Discord: "
    "[Discord](https://discord.gg/GWpVpekp) "
    "On 🤗Huggingface: [TeamTonic](https://huggingface.co/TeamTonic) & "
    "[MultiTransformer](https://huggingface.co/MultiTransformer) "
    "On 🌐Github: [Polytonic](https://github.com/tonic-ai) & contribute to 🌟 "
    "[PolyGPT](https://github.com/tonic-ai/polygpt-alpha) "
)

# Load the quantized model and its tokenizer once, at import time, and move
# the model to the GPU so every request reuses the same weights.
quant_dir = "llama-70b_2bit_quip"
quant_model = load_quantized_model(quant_dir).cuda()
tokenizer = AutoTokenizer.from_pretrained(quant_dir)


def generate_text(input_text: str, max_new_tokens: int = 256) -> str:
    """Sample a continuation of ``input_text`` from the quantized model.

    Args:
        input_text: Prompt typed by the user.
        max_new_tokens: Upper bound on the number of tokens generated after
            the prompt. Without an explicit bound, ``generate`` falls back to
            the generation config's default total length (20 tokens in
            ``transformers``), which cuts off almost any real prompt.

    Returns:
        The decoded sequence (prompt plus continuation) with special tokens
        removed, or an empty string when the prompt is blank.
    """
    # A blank prompt gives the model nothing to condition on (and some
    # tokenizers encode it to an empty tensor), so skip generation entirely.
    if not input_text or not input_text.strip():
        return ""

    input_ids = tokenizer.encode(input_text, return_tensors="pt").cuda()
    # generate() returns a batch of sequences; the batch size here is 1.
    output_ids = quant_model.generate(
        input_ids,
        do_sample=True,
        max_new_tokens=max_new_tokens,
    )[0]
    # Drop BOS/EOS/padding markers so they do not leak into the UI.
    return tokenizer.decode(output_ids, skip_special_tokens=True)


with gr.Blocks() as demo:
    gr.Markdown(title)
    gr.Markdown(description)
    # NOTE(review): the original indentation was lost; the prompt box and the
    # button are assumed to share the row, with the output below — confirm.
    with gr.Row():
        input_text = gr.Textbox(
            label="Enter text here",
            placeholder="Type something...",
            lines=2,
        )
        submit_button = gr.Button("Generate")
    # `interactive=False` is Gradio's way to make a component display-only;
    # Textbox has no `readonly` argument.
    output_text = gr.Textbox(label="Generated Text", interactive=False)

    submit_button.click(
        fn=generate_text,
        inputs=input_text,
        outputs=output_text,
    )

demo.launch()