"""
Script to create a working HuggingFace model from scratch,
or to adapt an existing model.
"""
|
|
| from transformers import ( |
| AutoTokenizer, |
| AutoModelForCausalLM, |
| AutoConfig, |
| GPT2LMHeadModel, |
| GPT2Tokenizer |
| ) |
| import torch |
| import json |
|
|
def create_basic_model():
    """Build a functional baseline model (GPT-2 architecture).

    Returns:
        tuple: (model, tokenizer, config) where ``model`` is a freshly
        initialized ``GPT2LMHeadModel`` (random weights — it is built from a
        config, not from pretrained weights), ``tokenizer`` is the pretrained
        GPT-2 tokenizer, and ``config`` is the configuration used.
    """
    # Start from the stock GPT-2 config and pin the architecture explicitly.
    config = AutoConfig.from_pretrained("gpt2")
    architecture = {
        "vocab_size": 50257,
        "n_positions": 1024,
        "n_ctx": 1024,
        "n_embd": 768,
        "n_layer": 12,
        "n_head": 12,
    }
    for field, value in architecture.items():
        setattr(config, field, value)

    # Instantiating from a config (not from_pretrained) yields random weights.
    model = GPT2LMHeadModel(config)

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    # GPT-2 ships without a dedicated pad token; alias it to eos.
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer, config
|
|
def save_complete_model(model, tokenizer, config, save_path="./Tenro_V4.1_complete"):
    """Full save with every required file.

    Writes the model weights/config and the tokenizer files via their
    ``save_pretrained`` methods, then adds a ``generation_config.json`` with
    sampling defaults, and lists the files written.

    Args:
        model: object exposing ``save_pretrained(path)``.
        tokenizer: object exposing ``save_pretrained(path)`` plus
            ``bos_token_id`` / ``eos_token_id`` / ``pad_token_id`` attributes.
        config: accepted for interface compatibility; the model config is
            already serialized by ``model.save_pretrained``.
        save_path: destination directory.
    """
    # Local import kept so the file-level import block is untouched.
    import os

    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)

    # Fall back to eos for bos/pad: GPT-2-style tokenizers often define
    # only an eos token.
    generation_config = {
        "_from_model_config": True,
        "bos_token_id": tokenizer.bos_token_id or tokenizer.eos_token_id,
        "eos_token_id": tokenizer.eos_token_id,
        "max_length": 1024,
        "pad_token_id": tokenizer.pad_token_id or tokenizer.eos_token_id,
        "do_sample": True,
        "temperature": 0.7,
        "top_p": 0.9,
        "repetition_penalty": 1.1
    }

    # Explicit encoding so the file is UTF-8 regardless of platform defaults;
    # os.path.join instead of f-string path concatenation.
    with open(os.path.join(save_path, "generation_config.json"), "w", encoding="utf-8") as f:
        json.dump(generation_config, f, indent=2)

    print(f"Modèle complet sauvegardé dans: {save_path}")
    print("Fichiers créés:")
    for file in os.listdir(save_path):
        size = os.path.getsize(os.path.join(save_path, file))
        print(f"  - {file}: {size/1024:.1f} KB")
|
|
def upload_to_huggingface(model, tokenizer, repo_name):
    """Upload to the HuggingFace Hub.

    Pushes the model first, then the tokenizer. Any failure (typically a
    missing login) is reported on stdout instead of raising.
    """
    try:
        for artifact in (model, tokenizer):
            artifact.push_to_hub(repo_name)
        print(f"Modèle uploadé vers: https://huggingface.co/{repo_name}")
    except Exception as e:
        print(f"Erreur upload: {e}")
        print("Assurez-vous d'être connecté avec: huggingface-cli login")
|
|
def load_your_existing_data(file_path):
    """Load your existing data if possible.

    Dispatches on the file extension:
      - ``.json``  -> parsed JSON object
      - ``.bin``   -> torch checkpoint loaded on CPU
      - otherwise  -> raw text content

    Returns None (after printing the error) when the file cannot be read —
    best-effort by design, so the caller can continue without the data.
    """
    try:
        if file_path.endswith('.json'):
            # Explicit UTF-8 so parsing does not depend on the platform locale.
            with open(file_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        elif file_path.endswith('.bin'):
            # SECURITY: torch.load unpickles arbitrary objects — only load
            # checkpoints from trusted sources (consider weights_only=True).
            return torch.load(file_path, map_location='cpu')
        else:
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
    except Exception as e:
        print(f"Impossible de charger {file_path}: {e}")
        return None
|
|
if __name__ == "__main__":
    print("🚀 Création d'un modèle HuggingFace complet...")

    # Build a fresh GPT-2-architecture model plus its tokenizer and config.
    model, tokenizer, config = create_basic_model()

    # Best-effort load of pre-existing data. The file name has no recognized
    # extension, so this takes the plain-text branch of load_your_existing_data.
    # NOTE(review): existing_data is never merged into the model below — the
    # saved model keeps its random initialization.
    your_file = "Tenro_V4.1.1"
    existing_data = load_your_existing_data(your_file)
    if existing_data:
        print("✅ Données existantes chargées")

    # Persist model, tokenizer and generation config to disk.
    save_complete_model(model, tokenizer, config)

    # Smoke test: sample a short continuation from the (randomly initialized)
    # model, so output quality is not expected to be meaningful.
    print("\n🧪 Test du modèle:")
    inputs = tokenizer("Hello, I am Tenro", return_tensors="pt")
    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            max_length=50,
            do_sample=True,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id  # GPT-2 has no pad token
        )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Texte généré: {generated_text}")

    # Optional interactive upload to the HuggingFace Hub.
    upload_choice = input("\nVoulez-vous uploader vers HuggingFace? (y/n): ")
    if upload_choice.lower() == 'y':
        repo_name = input("Nom du repo (ex: votre_nom/Tenro_V4.1): ")
        upload_to_huggingface(model, tokenizer, repo_name)

    print("\n✅ Terminé! Vous avez maintenant un modèle complet et fonctionnel.")