from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import load_dataset, Dataset, concatenate_datasets
import json

def load_and_prepare_data():
    # Load the base customer support dataset
    base_dataset = load_dataset("Victorano/customer-support-1k")
    
    # Load custom car service data
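    # Expected JSONL schema, one record per line (keys inferred from the
    # formatting logic below): {"customer_query": "...", "support_response": "..."}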
    car_service_data = []
    with open('car_service_data.jsonl', 'r') as f:
        for line in f:
            car_service_data.append(json.loads(line))
    
    # Wrap the custom records in a Dataset object (formatting to match the
    # base dataset happens below)
    car_service_dataset = Dataset.from_list(car_service_data)
    
    # Load tokenizer; DialoGPT ships without a pad token, so reuse EOS for padding
    tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
    tokenizer.pad_token = tokenizer.eos_token
    
    # Format each record as a single training string; end with the EOS token
    # so the model learns where a support turn finishes
    def format_conversation(example):
        if 'question' in example and 'answer' in example:
            # Schema of the base dataset
            conversation = f"Customer: {example['question']}\nSupport: {example['answer']}"
        else:
            # Schema of the custom car service data
            conversation = f"Customer: {example['customer_query']}\nSupport: {example['support_response']}"
        return {"text": conversation + tokenizer.eos_token}
    
    # Apply formatting to both datasets
    formatted_base_dataset = base_dataset.map(
        format_conversation,
        remove_columns=base_dataset["train"].column_names
    )
    
    formatted_car_dataset = car_service_dataset.map(
        format_conversation,
        remove_columns=car_service_dataset.column_names
    )
    
    # Hold out a slice of the car service data so the same examples do not
    # end up in both the training and evaluation splits
    car_split = formatted_car_dataset.train_test_split(test_size=0.1, seed=42)

    # Combine datasets
    combined_train = concatenate_datasets([formatted_base_dataset["train"], car_split["train"]])
    combined_test = concatenate_datasets([formatted_base_dataset["test"], car_split["test"]])
    
    # Tokenize the dataset; no return_tensors here, since map() stores plain
    # lists and the data collator converts batches to tensors at training time
    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=512,
        )
    
    tokenized_dataset = {
        "train": combined_train.map(
            tokenize_function,
            batched=True,
            remove_columns=combined_train.column_names
        ),
        "test": combined_test.map(
            tokenize_function,
            batched=True,
            remove_columns=combined_test.column_names
        )
    }
    
    return tokenized_dataset, tokenizer

def train_model():
    # Load and prepare data
    tokenized_dataset, tokenizer = load_and_prepare_data()
    
    # Load model
    model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")
    
    # Define training arguments
    training_args = TrainingArguments(
        output_dir="./customer_support_chatbot",
        num_train_epochs=3,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=100,
        save_strategy="epoch",
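        # note: "evaluation_strategy" was renamed "eval_strategy" in recent
        # transformers releases; use whichever your installed version expects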
        evaluation_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
    )
    
    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    )
    
    # Train the model
    trainer.train()
    
    # Save the model and tokenizer
    model.save_pretrained("./customer_support_chatbot")
    tokenizer.save_pretrained("./customer_support_chatbot")
    
    print("Training completed! Model saved to ./customer_support_chatbot")
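
# A minimal inference sketch, assuming training finished and the model was
# saved to ./customer_support_chatbot as above; the generation settings are
# illustrative defaults, not tuned values
def chat_once(query: str) -> str:
    tokenizer = AutoTokenizer.from_pretrained("./customer_support_chatbot")
    model = AutoModelForCausalLM.from_pretrained("./customer_support_chatbot")
    prompt = f"Customer: {query}\nSupport:"
    inputs = tokenizer(prompt, return_tensors="pt")
    output_ids = model.generate(
        **inputs,
        max_new_tokens=128,
        do_sample=True,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Decode only the newly generated tokens, skipping the prompt
    reply_ids = output_ids[0, inputs["input_ids"].shape[1]:]
    return tokenizer.decode(reply_ids, skip_special_tokens=True).strip()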

if __name__ == "__main__":
    train_model()