import json

from datasets import Dataset, concatenate_datasets, load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)


def load_and_prepare_data():
    # Load the base customer support dataset from the Hugging Face Hub
    base_dataset = load_dataset("Victorano/customer-support-1k")

    # Load custom car service data from a local JSONL file
    car_service_data = []
    with open("car_service_data.jsonl", "r") as f:
        for line in f:
            car_service_data.append(json.loads(line))
    car_service_dataset = Dataset.from_list(car_service_data)

    # Load the tokenizer; DialoGPT (GPT-2 based) ships without a pad token,
    # so reuse the EOS token for padding to avoid an error during tokenization
    tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
    tokenizer.pad_token = tokenizer.eos_token

    # Render each example as a single Customer/Support exchange
    def format_conversation(example):
        if "question" in example and "answer" in example:
            # Base dataset schema
            conversation = f"Customer: {example['question']}\nSupport: {example['answer']}"
        else:
            # Car service data schema
            conversation = (
                f"Customer: {example['customer_query']}\n"
                f"Support: {example['support_response']}"
            )
        return {"text": conversation}

    # Apply formatting to both datasets
    formatted_base_dataset = base_dataset.map(
        format_conversation,
        remove_columns=base_dataset["train"].column_names,
    )
    formatted_car_dataset = car_service_dataset.map(
        format_conversation,
        remove_columns=car_service_dataset.column_names,
    )

    # Split the car service data before combining, so the same examples do
    # not appear in both train and test (the evaluation would otherwise leak)
    car_split = formatted_car_dataset.train_test_split(test_size=0.1, seed=42)

    combined_train = concatenate_datasets(
        [formatted_base_dataset["train"], car_split["train"]]
    )
    combined_test = concatenate_datasets(
        [formatted_base_dataset["test"], car_split["test"]]
    )

    # Tokenize the dataset; the data collator converts batches to tensors
    # later, so return_tensors must not be set here (it would break the
    # list-of-lists output that a batched map expects)
    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=512,
        )

    tokenized_dataset = {
        "train": combined_train.map(
            tokenize_function,
            batched=True,
            remove_columns=combined_train.column_names,
        ),
        "test": combined_test.map(
            tokenize_function,
            batched=True,
            remove_columns=combined_test.column_names,
        ),
    }

    return tokenized_dataset, tokenizer


def train_model():
    # Load and prepare data
    tokenized_dataset, tokenizer = load_and_prepare_data()

    # Load the base model to fine-tune
    model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")

    # Define training arguments; save and eval strategies must match for
    # load_best_model_at_end to work
    training_args = TrainingArguments(
        output_dir="./customer_support_chatbot",
        num_train_epochs=3,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=100,
        save_strategy="epoch",
        eval_strategy="epoch",  # named evaluation_strategy on transformers < 4.41
        load_best_model_at_end=True,
        push_to_hub=False,
    )

    # Initialize the trainer; mlm=False gives a causal LM objective, with the
    # collator copying input_ids into labels and masking padding positions
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    )

    # Train the model
    trainer.train()

    # Save the fine-tuned model and tokenizer
    model.save_pretrained("./customer_support_chatbot")
    tokenizer.save_pretrained("./customer_support_chatbot")
    print("Training completed! Model saved to ./customer_support_chatbot")


if __name__ == "__main__":
    train_model()
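

# --- Inference sketch (not part of the original training pipeline) ---
# A minimal example of chatting with the fine-tuned model, assuming it was
# saved to ./customer_support_chatbot as above and that this runs as a
# separate script after training. The chat() helper and the sampling
# parameters are illustrative choices, not settings from the original;
# the prompt mirrors the "Customer: ... Support:" template used in training.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


def chat(query: str, model_dir: str = "./customer_support_chatbot") -> str:
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForCausalLM.from_pretrained(model_dir)
    model.eval()

    # Prompt with the same template the model was fine-tuned on, ending at
    # "Support:" so the model completes the support agent's reply
    prompt = f"Customer: {query}\nSupport:"
    inputs = tokenizer(prompt, return_tensors="pt")

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=100,
            do_sample=True,
            top_p=0.9,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Strip the prompt tokens and decode only the generated reply
    reply = tokenizer.decode(
        output_ids[0][inputs["input_ids"].shape[1]:],
        skip_special_tokens=True,
    )
    return reply.strip()


if __name__ == "__main__":
    print(chat("My car is making a grinding noise when I brake."))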