# banking_model.py
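"""Fine-tune a Llama-2 chat model on banking question/answer pairs.

Expects training data as a CSV or JSON file with `question` and `answer`
fields; see `BankingModelTrainer.prepare_data` below.
"""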
import pandas as pd
import torch
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
TrainingArguments,
Trainer,
DataCollatorForLanguageModeling
)
from datasets import Dataset
import json
from pathlib import Path
class BankingModelTrainer:
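    """Wraps model loading, data preparation, fine-tuning, and response generation."""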
def __init__(
self,
base_model_name="meta-llama/Llama-2-13b-chat-hf",
output_dir="./fine_tuned_model",
max_length=512
):
self.base_model_name = base_model_name
self.output_dir = Path(output_dir)
self.max_length = max_length
self.device = "cuda" if torch.cuda.is_available() else "cpu"
        # Llama-2 loading configuration: 8-bit weights, automatic device placement,
        # and a 10GB GPU memory cap
model_config = {
"device_map": "auto",
"torch_dtype": torch.bfloat16,
"low_cpu_mem_usage": True,
"max_memory": {0: "10GB"},
"load_in_8bit": True
}
        # Initialize the tokenizer and model; Llama-2 ships without a pad token,
        # so reuse EOS for padding so batched tokenization works
        self.tokenizer = AutoTokenizer.from_pretrained(base_model_name)
        self.tokenizer.pad_token = self.tokenizer.eos_token
self.model = AutoModelForCausalLM.from_pretrained(
base_model_name,
**model_config
)
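        # Note: the plain Trainer cannot fully fine-tune an 8-bit quantized model;
        # a parameter-efficient method is typically attached first. A minimal
        # sketch using the `peft` library (an assumed extra dependency, not part
        # of the original script):
        #
        #   from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
        #   self.model = prepare_model_for_kbit_training(self.model)
        #   self.model = get_peft_model(self.model, LoraConfig(
        #       r=16, lora_alpha=32, target_modules=["q_proj", "v_proj"],
        #       lora_dropout=0.05, task_type="CAUSAL_LM",
        #   ))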
def prepare_data(self, data_path):
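        """Load question/answer pairs from a CSV or JSON file and tokenize
        them in the Llama-2 `[INST] ... [/INST]` chat format."""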
        # Read the data file
if data_path.endswith('.csv'):
df = pd.read_csv(data_path)
elif data_path.endswith('.json'):
with open(data_path, 'r', encoding='utf-8') as f:
data = json.load(f)
df = pd.DataFrame(data)
else:
            raise ValueError("File format must be CSV or JSON")
        # Build and tokenize the training examples
def prepare_examples(examples):
conversations = []
for q, a in zip(examples['question'], examples['answer']):
                # Llama-2 single-turn instruction format; appending EOS
                # teaches the model where each answer ends
                conv = f"[INST] {q} [/INST] {a}{self.tokenizer.eos_token}"
conversations.append(conv)
            # Tokenize with the Llama-2 tokenizer settings
encodings = self.tokenizer(
conversations,
truncation=True,
padding=True,
max_length=self.max_length,
return_tensors="pt"
)
return encodings
dataset = Dataset.from_pandas(df)
tokenized_dataset = dataset.map(
prepare_examples,
batched=True,
remove_columns=dataset.column_names
)
return tokenized_dataset
def train(self, dataset, epochs=3, batch_size=4):
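        """Fine-tune on the tokenized dataset. The effective batch size is
        per_device_train_batch_size * gradient_accumulation_steps (4 * 4 = 16)."""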
training_args = TrainingArguments(
output_dir=str(self.output_dir),
num_train_epochs=epochs,
per_device_train_batch_size=batch_size,
gradient_accumulation_steps=4,
save_steps=500,
logging_steps=100,
            learning_rate=2e-5,  # conservative learning rate for Llama-2
            warmup_steps=100,
            bf16=True,  # match the bfloat16 load dtype; fp16 would conflict with bf16 weights
            save_total_limit=2,
            logging_dir=str(self.output_dir / "logs"),
            gradient_checkpointing=True  # trade extra compute for lower activation memory
)
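        # mlm=False selects the causal-LM objective: the collator copies
        # input_ids into labels so the model trains on next-token prediction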
data_collator = DataCollatorForLanguageModeling(
tokenizer=self.tokenizer,
mlm=False
)
trainer = Trainer(
model=self.model,
args=training_args,
train_dataset=dataset,
data_collator=data_collator
)
trainer.train()
self.model.save_pretrained(self.output_dir)
self.tokenizer.save_pretrained(self.output_dir)
def generate_response(self, prompt):
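        """Generate an answer for `prompt` using sampling (temperature 0.7, top-p 0.9)."""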
        # Wrap the prompt in the Llama-2 instruction format
        formatted_prompt = f"[INST] {prompt} [/INST]"
inputs = self.tokenizer.encode(
formatted_prompt,
return_tensors="pt"
).to(self.device)
outputs = self.model.generate(
inputs,
max_length=self.max_length,
num_return_sequences=1,
temperature=0.7,
top_p=0.9,
do_sample=True,
pad_token_id=self.tokenizer.eos_token_id,
            repetition_penalty=1.2  # penalize repeated phrases
)
        # Decode only the newly generated tokens so the prompt is not echoed back
        response = self.tokenizer.decode(
            outputs[0][inputs.shape[-1]:],
            skip_special_tokens=True
        ).strip()
        return response
if __name__ == "__main__":
trainer = BankingModelTrainer()
dataset = trainer.prepare_data("banking_qa.json")
trainer.train(dataset)
    response = trainer.generate_response("شرایط وام مسکن چیست؟")  # "What are the terms of a home loan?"
print(response)