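"""Fine-tune microsoft/DialoGPT-medium as a customer-support chatbot.

Merges the Victorano/customer-support-1k dataset with custom car-service
Q&A pairs from car_service_data.jsonl, trains with the Hugging Face Trainer,
and saves the result to ./customer_support_chatbot.
"""
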
import json

from datasets import Dataset, concatenate_datasets, load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)


def load_and_prepare_data():
    # Load the base customer support dataset
    base_dataset = load_dataset("Victorano/customer-support-1k")

    # Load custom car service data; each line of the JSONL file is an object
    # like {"customer_query": "...", "support_response": "..."}
    car_service_data = []
    with open('car_service_data.jsonl', 'r') as f:
        for line in f:
            car_service_data.append(json.loads(line))

    # Convert car service data to the same format as the base dataset
    car_service_dataset = Dataset.from_list(car_service_data)

    # Load tokenizer; DialoGPT is GPT-2 based and ships without a pad token,
    # so reuse the EOS token to make fixed-length padding possible
    tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
    tokenizer.pad_token = tokenizer.eos_token
    # Format conversations from either schema into a single "text" field
    def format_conversation(example):
        if 'question' in example and 'answer' in example:
            # Base dataset schema
            conversation = f"Customer: {example['question']}\nSupport: {example['answer']}"
        else:
            # Car service data schema
            conversation = f"Customer: {example['customer_query']}\nSupport: {example['support_response']}"
        return {"text": conversation}
    # Apply formatting to both datasets
    formatted_base_dataset = base_dataset.map(
        format_conversation,
        remove_columns=base_dataset["train"].column_names
    )
    formatted_car_dataset = car_service_dataset.map(
        format_conversation,
        remove_columns=car_service_dataset.column_names
    )

    # Hold out a slice of the car service data so the test split is not
    # contaminated with examples the model was trained on
    car_split = formatted_car_dataset.train_test_split(test_size=0.1, seed=42)

    # Combine datasets
    combined_train = concatenate_datasets([formatted_base_dataset["train"], car_split["train"]])
    combined_test = concatenate_datasets([formatted_base_dataset["test"], car_split["test"]])
    # Tokenize the dataset; return plain lists here, since the data collator
    # handles tensor conversion at batch time
    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=512
        )
    tokenized_dataset = {
        "train": combined_train.map(
            tokenize_function,
            batched=True,
            remove_columns=combined_train.column_names
        ),
        "test": combined_test.map(
            tokenize_function,
            batched=True,
            remove_columns=combined_test.column_names
        )
    }
    return tokenized_dataset, tokenizer

def train_model():
    # Load and prepare data
    tokenized_dataset, tokenizer = load_and_prepare_data()

    # Load model
    model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")
    # Define training arguments
    training_args = TrainingArguments(
        output_dir="./customer_support_chatbot",
        num_train_epochs=3,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=100,
        save_strategy="epoch",
        evaluation_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
    )
    # Initialize trainer; mlm=False makes the collator build causal-LM labels
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    )
    # Train the model
    trainer.train()

    # Save the model and tokenizer
    model.save_pretrained("./customer_support_chatbot")
    tokenizer.save_pretrained("./customer_support_chatbot")
    print("Training completed! Model saved to ./customer_support_chatbot")

if __name__ == "__main__":
    train_model()