import gradio as gr
import torch
from transformers import AutoTokenizer
from awq import AutoAWQForCausalLM

model_path = "bragour/Camel-7b-chat-awq"

# Load the AWQ-quantized model and its tokenizer
model = AutoAWQForCausalLM.from_quantized(
    model_path,
    fuse_layers=True,
    trust_remote_code=False,
    safetensors=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=False)


def respond(message):
    # Wrap the user message in the Llama-style instruction template
    formatted_prompt = f"[INST]{message}[/INST]"
    tokens = tokenizer(formatted_prompt, return_tensors="pt").input_ids.cuda()

    # Generate the response locally with greedy decoding
    result = model.generate(
        tokens,
        do_sample=False,
        max_new_tokens=200,
    )

    # Decode only the newly generated tokens so the prompt is not echoed back
    response = tokenizer.decode(result[0][tokens.shape[1]:], skip_special_tokens=True)
    return response


# Define the Gradio interface
demo = gr.Interface(
    fn=respond,
    inputs="text",
    outputs="text",
)

if __name__ == "__main__":
    demo.launch(inline=False)
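Once the script is running, the interface can also be queried programmatically instead of through the browser. A minimal sketch using gradio_client, assuming the app is served at Gradio's default local URL and default endpoint name:

from gradio_client import Client

# Assumes the script above is running on the default port 7860
client = Client("http://127.0.0.1:7860/")

# gr.Interface exposes its function under the default endpoint "/predict"
result = client.predict("What is the capital of Egypt?", api_name="/predict")
print(result)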