import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import spaces

model_name = "hosseinhimself/ISANG-v1.0-8B"
base_model_name = "unsloth/Meta-Llama-3.1-8B"

# Load the tokenizer once at import time.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Cache the model so the 8B weights are loaded only once per worker,
# not on every request.
_model = None


@spaces.GPU
def load_model():
    """Load the base model and attach the ISANG PEFT adapter, caching the result."""
    global _model
    if _model is not None:
        return _model
    try:
        # Load the base model.
        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_name,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,
            low_cpu_mem_usage=True,
        )
        # Apply the fine-tuned PEFT adapter on top of the base model.
        _model = PeftModel.from_pretrained(base_model, model_name)
        print(f"Model loaded successfully. Using device: {_model.device}")
        return _model
    except Exception as e:
        print(f"Error loading model: {e}")
        raise


@spaces.GPU
def generate_text(prompt):
    model = load_model()
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=200,
            num_return_sequences=1,
            do_sample=True,  # required for temperature to have any effect
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id,  # Llama has no dedicated pad token
        )
    # Decode only the newly generated tokens so the prompt is not echoed back.
    new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)


gradio_app = gr.Interface(
    fn=generate_text,
    inputs=gr.Textbox(label="Enter your message", lines=3),
    outputs=gr.Textbox(label="Chatbot Response"),
    title="ISANG Chatbot",
    description=f"""This is a simple chatbot powered by the ISANG model,
fine-tuned from {base_model_name}. Enter your message and see how the
chatbot responds!""",
    examples=[
        ["سلام، چطوری؟"],  # "Hi, how are you?"
        ["برام یه داستان تعریف کن"],  # "Tell me a story"
        ["بهترین کتابی که خوندی چی بوده؟"],  # "What's the best book you've read?"
        ["توی اوقات فراغتت چی کار می‌کنی؟"],  # "What do you do in your free time?"
        ["نظرت درباره هوش مصنوعی چیه؟"],  # "What do you think about AI?"
    ],
)

if __name__ == "__main__":
    gradio_app.launch()
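
# A minimal sketch of calling this app programmatically with gradio_client
# (a separate `pip install gradio_client`; assumes the app is already running
# on Gradio's default local port — adjust the URL for a hosted Space):
#
#   from gradio_client import Client
#
#   client = Client("http://127.0.0.1:7860")
#   reply = client.predict("سلام، چطوری؟", api_name="/predict")  # "Hi, how are you?"
#   print(reply)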