# app.py
"""Gradio front end for text generation with the AWQ-quantized Mistral 7B v0.1 model.

Importing this module loads the model and tokenizer named by ``MODEL_NAME`` and
builds the Gradio ``interface``; running it as a script launches the UI with a
public share link.
"""

from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
import torch
import gradio as gr

# Model name from Hugging Face
MODEL_NAME = "TheBloke/Mistral-7B-v0.1-AWQ"

# Load the model
print("🚀 Loading Mistral 7B v0.1 AWQ model...")
model = AutoAWQForCausalLM.from_quantized(
    MODEL_NAME,
    fuse_layers=True,
    trust_remote_code=False,
    safetensors=True,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=False)
print("✅ Model loaded successfully!")


def _model_device():
    """Return the device the prompt tensors should be moved to."""
    # NOTE(review): the AutoAWQ wrapper may not expose `.device` in every
    # release -- confirm. Fall back to the device of the first parameter.
    device = getattr(model, "device", None)
    if device is None:
        device = next(model.parameters()).device
    return device


# Text generation function
def generate_text(prompt, temperature, max_tokens):
    """Generate a sampled continuation of *prompt* and return only the new text.

    Args:
        prompt: Text to continue. A blank (or ``None``) prompt returns ``""``
            without running the model.
        temperature: Sampling temperature passed to ``model.generate``.
        max_tokens: Upper bound on newly generated tokens; coerced to ``int``
            because UI sliders may deliver a float.

    Returns:
        The decoded continuation with special tokens removed and surrounding
        whitespace stripped. The prompt itself is never included.
    """
    if not prompt or not prompt.strip():
        return ""

    inputs = tokenizer(prompt, return_tensors="pt").to(_model_device())
    prompt_length = inputs.input_ids.shape[1]

    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=int(max_tokens),
            temperature=float(temperature),
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Drop the prompt at token level: slicing the decoded string by
    # len(prompt) is only correct when the decoded text starts with the
    # prompt verbatim, which a substring test does not establish.
    new_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()


# Gradio Interface
interface = gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Textbox(lines=3, placeholder="Ask Mistral something...", label="Prompt"),
        gr.Slider(0.1, 1.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(50, 1024, value=512, step=10, label="Max Tokens"),
    ],
    outputs=gr.Textbox(lines=10, label="Response"),
    title="🧠 Mistral 7B v0.1 AWQ",
    description="Run the quantized Mistral 7B v0.1 model locally or on Google Colab using Gradio.",
    theme="default",
)

if __name__ == "__main__":
    interface.launch(share=True)