from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import bitsandbytes as bnb  # required backend for 8-bit quantization

# Load the model quantized in 8-bit
tokenizer = AutoTokenizer.from_pretrained("Hawoly18/llama3.2-3B-Wolof")
model = AutoModelForCausalLM.from_pretrained(
    "Hawoly18/llama3.2-3B-Wolof",
    load_in_8bit=True,   # use 8-bit quantization
    device_map="auto"    # dispatch the model automatically across available devices
)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Function to generate answers
def generate_response(question, max_length=512):
    input_text = f"Question: {question}\nRéponse:"
    # Tokenize and keep the attention mask returned by the tokenizer,
    # then move the tensors to the model's device
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
    input_ids = inputs["input_ids"].to(model.device)
    attention_mask = inputs["attention_mask"].to(model.device)

    with torch.no_grad():
        output_ids = model.generate(
            input_ids,
            max_length=max_length,
            attention_mask=attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            num_beams=5,
            no_repeat_ngram_size=2,
            early_stopping=True
        )

    # Decode the output and strip the prompt from the generated text
    response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    response = response.replace(input_text, "").strip()
    return response

# Gradio interface
import gradio as gr

interface = gr.Interface(
    fn=generate_response,
    inputs="text",
    outputs="text",
    title="Model Q&A Interface",
    description="Ask a question related to BSE and entrepreneurship!",
    examples=[["yan jumtukaay ci xaral yi BSE moom mën a dimbali ndax moom mën woyal sama liggéey ci entrepreneur yi"]]
)

interface.launch(share=True)
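
# Note: recent transformers releases deprecate passing load_in_8bit directly to
# from_pretrained in favour of a BitsAndBytesConfig object. Below is a minimal
# sketch of the equivalent loading call under that API; it assumes a CUDA GPU is
# available, since the bitsandbytes 8-bit kernels do not run on CPU.
from transformers import BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(load_in_8bit=True)
model_8bit = AutoModelForCausalLM.from_pretrained(
    "Hawoly18/llama3.2-3B-Wolof",
    quantization_config=quantization_config,  # replaces load_in_8bit=True
    device_map="auto",
)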