import json

from datasets import load_dataset, Dataset, concatenate_datasets
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
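
# The car-service file is read below as JSON Lines; a plausible record,
# inferred from the keys this script consumes (adjust to your actual export):
#   {"customer_query": "My brakes squeal when I stop.",
#    "support_response": "Squealing usually means worn pads; book an inspection."}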

def load_and_prepare_data():
    # Load the base customer support dataset
    base_dataset = load_dataset("Victorano/customer-support-1k")

    # Load custom car service data (one JSON object per line)
    car_service_data = []
    with open("car_service_data.jsonl", "r") as f:
        for line in f:
            car_service_data.append(json.loads(line))

    # Convert the car service records into a Hugging Face Dataset
    car_service_dataset = Dataset.from_list(car_service_data)
    # Load tokenizer. DialoGPT's GPT-2 tokenizer ships without a pad token,
    # which padding="max_length" below requires, so reuse EOS for padding.
    tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
    tokenizer.pad_token = tokenizer.eos_token
    # Normalize both data sources into a single "text" field
    def format_conversation(example):
        if "question" in example and "answer" in example:
            # Format for the base dataset
            conversation = f"Customer: {example['question']}\nSupport: {example['answer']}"
        else:
            # Format for the car service data
            conversation = f"Customer: {example['customer_query']}\nSupport: {example['support_response']}"
        return {"text": conversation}
    # Apply formatting to both datasets
    formatted_base_dataset = base_dataset.map(
        format_conversation,
        remove_columns=base_dataset["train"].column_names,
    )
    formatted_car_dataset = car_service_dataset.map(
        format_conversation,
        remove_columns=car_service_dataset.column_names,
    )

    # Hold out part of the car service data for evaluation so the same
    # examples never appear in both the train and test splits
    car_split = formatted_car_dataset.train_test_split(test_size=0.1, seed=42)

    # Combine datasets
    combined_train = concatenate_datasets([formatted_base_dataset["train"], car_split["train"]])
    combined_test = concatenate_datasets([formatted_base_dataset["test"], car_split["test"]])
    # Tokenize the dataset. No return_tensors here: Dataset.map stores plain
    # lists, and the data collator converts each batch to tensors later.
    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=512,
        )

    tokenized_dataset = {
        "train": combined_train.map(
            tokenize_function,
            batched=True,
            remove_columns=combined_train.column_names,
        ),
        "test": combined_test.map(
            tokenize_function,
            batched=True,
            remove_columns=combined_test.column_names,
        ),
    }
    return tokenized_dataset, tokenizer
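
# Optional sanity check: inspect one formatted, tokenized training example.
#   ds, tok = load_and_prepare_data()
#   print(tok.decode(ds["train"][0]["input_ids"], skip_special_tokens=True))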

def train_model():
    # Load and prepare data
    tokenized_dataset, tokenizer = load_and_prepare_data()

    # Load model
    model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")

    # Define training arguments
    training_args = TrainingArguments(
        output_dir="./customer_support_chatbot",
        num_train_epochs=3,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=100,
        save_strategy="epoch",
        evaluation_strategy="epoch",  # renamed to eval_strategy in newer transformers releases
        load_best_model_at_end=True,
        push_to_hub=False,
    )
    # Initialize trainer; mlm=False makes the collator build causal-LM labels
    # (copies of input_ids, with pad positions masked to -100)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    )
    # Train the model
    trainer.train()

    # Save the model and tokenizer
    model.save_pretrained("./customer_support_chatbot")
    tokenizer.save_pretrained("./customer_support_chatbot")
    print("Training completed! Model saved to ./customer_support_chatbot")

if __name__ == "__main__":
    train_model()