import json
from pathlib import Path

import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)


class BankingModelTrainer:
    def __init__(
        self,
        base_model_name="meta-llama/Llama-2-13b-chat-hf",
        output_dir="./fine_tuned_model",
        max_length=512,
    ):
        self.base_model_name = base_model_name
        self.output_dir = Path(output_dir)
        self.max_length = max_length
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Llama-2 model settings: 8-bit weights with automatic device
        # placement to fit the 13B model into limited GPU memory.
        # Note: int8 weights cannot be fully fine-tuned directly; the usual
        # workaround is LoRA adapters, sketched at the end of this file.
        model_config = {
            "device_map": "auto",
            "torch_dtype": torch.bfloat16,
            "low_cpu_mem_usage": True,
            "max_memory": {0: "10GB"},
            "load_in_8bit": True,
        }

        # Initialize the tokenizer and model.
        self.tokenizer = AutoTokenizer.from_pretrained(base_model_name)
        # Llama-2 ships without a pad token; reuse EOS so batched
        # tokenization and the data collator can pad.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        self.model = AutoModelForCausalLM.from_pretrained(
            base_model_name,
            **model_config,
        )

    def prepare_data(self, data_path):
        # Read the data from a CSV or JSON file.
        # Expected columns/keys: "question" and "answer".
        if data_path.endswith(".csv"):
            df = pd.read_csv(data_path)
        elif data_path.endswith(".json"):
            with open(data_path, "r", encoding="utf-8") as f:
                data = json.load(f)
            df = pd.DataFrame(data)
        else:
            raise ValueError("File format must be CSV or JSON")

        # Process and tokenize the question/answer pairs.
        def prepare_examples(examples):
            conversations = []
            for q, a in zip(examples["question"], examples["answer"]):
                # Llama-2 chat format for a single-turn conversation.
                conv = f"[INST] {q} [/INST] {a}"
                conversations.append(conv)

            # Truncate here; leave padding to the data collator so each
            # training batch is padded dynamically.
            encodings = self.tokenizer(
                conversations,
                truncation=True,
                max_length=self.max_length,
            )
            return encodings

        dataset = Dataset.from_pandas(df)
        tokenized_dataset = dataset.map(
            prepare_examples,
            batched=True,
            remove_columns=dataset.column_names,
        )
        return tokenized_dataset

    def train(self, dataset, epochs=3, batch_size=4):
        training_args = TrainingArguments(
            output_dir=str(self.output_dir),
            num_train_epochs=epochs,
            per_device_train_batch_size=batch_size,
            gradient_accumulation_steps=4,
            save_steps=500,
            logging_steps=100,
            learning_rate=2e-5,  # reduced learning rate for Llama-2
            warmup_steps=100,
            bf16=True,  # match the bfloat16 weights; fp16 grad scaling conflicts with them
            save_total_limit=2,
            logging_dir=str(self.output_dir / "logs"),
            gradient_checkpointing=True,  # trade compute for memory
        )

        # Causal LM collator: pads each batch and derives labels from input_ids.
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False,
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=dataset,
            data_collator=data_collator,
        )

        trainer.train()
        self.model.save_pretrained(self.output_dir)
        self.tokenizer.save_pretrained(self.output_dir)

    def generate_response(self, prompt):
        # Llama-2 prompt format.
        formatted_prompt = f"[INST] {prompt} [/INST]"
        inputs = self.tokenizer.encode(
            formatted_prompt,
            return_tensors="pt",
        ).to(self.device)

        outputs = self.model.generate(
            inputs,
            max_new_tokens=self.max_length,  # budget for the answer, independent of prompt length
            num_return_sequences=1,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=self.tokenizer.eos_token_id,
            repetition_penalty=1.2,  # discourage repeated phrases
        )

        # Decode only the newly generated tokens; string-replacing the prompt
        # is unreliable because decoding rarely reproduces it verbatim.
        response = self.tokenizer.decode(
            outputs[0][inputs.shape[-1]:],
            skip_special_tokens=True,
        )
        return response.strip()


if __name__ == "__main__":
    trainer = BankingModelTrainer()
    dataset = trainer.prepare_data("banking_qa.json")
    trainer.train(dataset)
    response = trainer.generate_response("شرایط وام مسکن چیست؟")  # "What are the terms for a home mortgage loan?"
    print(response)
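

# ---------------------------------------------------------------------------
# Sketch: LoRA adapters for the 8-bit model. The Trainer above cannot
# backpropagate into int8 quantized weights, so the common approach is to
# freeze the quantized base model and train small LoRA adapters instead.
# This is a minimal sketch, assuming the `peft` package is installed; the
# helper name `attach_lora` and the hyperparameter values are illustrative
# choices, not part of the original class.
# ---------------------------------------------------------------------------
def attach_lora(model):
    # Imported here so the sketch does not add a hard dependency.
    from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

    # Prepare the quantized model for training (enables input gradients,
    # casts layer norms to fp32).
    model = prepare_model_for_kbit_training(model)

    lora_config = LoraConfig(
        r=16,               # adapter rank
        lora_alpha=32,      # adapter scaling factor
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=["q_proj", "v_proj"],  # Llama-2 attention projections
    )
    return get_peft_model(model, lora_config)


# Hypothetical usage: wrap the model before calling train().
#   trainer = BankingModelTrainer()
#   trainer.model = attach_lora(trainer.model)
#   trainer.train(trainer.prepare_data("banking_qa.json"))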