import os
import pandas as pd
import transformers
import torch
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling, AutoTokenizer, AutoModelForCausalLM
from torch.utils.data import Dataset
import streamlit as st

# Directory containing the model checkpoint. Note that Ollama's native store
# holds GGUF blobs, which from_pretrained() cannot read; this path must point
# at a Hugging Face-format checkpoint (config.json, tokenizer files, weights).
model_dir = "C:/Users/myuser/.ollama/models/meta-llama-3-8b"

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForCausalLM.from_pretrained(model_dir)
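# Note (assumption about the runtime environment): an 8B model in float32
# needs roughly 32 GB of memory. Passing torch_dtype=torch.bfloat16 to
# from_pretrained (and device_map="auto" with accelerate installed) roughly
# halves the footprint if memory is tight.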

# Add a padding token to the tokenizer if it doesn't have one
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    # The new token enlarges the vocabulary, so the embedding matrix must be
    # resized to match or training will fail with an index error
    model.resize_token_embeddings(len(tokenizer))

# Custom Dataset class
class CustomTextDataset(Dataset):
    def __init__(self, tokenized_inputs):
        self.input_ids = tokenized_inputs['input_ids']
        self.attention_masks = tokenized_inputs['attention_mask']

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx]
        }

# Prepare dataset function
def prepare_dataset(texts, tokenizer, block_size=128):
    inputs = tokenizer(texts, return_tensors='pt', max_length=block_size, truncation=True, padding=True)
    dataset = CustomTextDataset(inputs)
    return dataset

# Load the dataset
file_path = "path/to/it_support_transcript_dataset.csv"
df = pd.read_csv(file_path)

# Filter the dataset based on the given criteria and make a copy
filtered_df = df[
    (df['Resolution Status'] == 'Resolved') &
    (df['Customer Satisfaction (CSAT) Score'] >= 4) &
    (df['Customer Feedback Comments'].isin(['Very satisfied', 'Satisfied']))
].copy()

# Combine only the interaction notes into a single text for training
filtered_df.loc[:, 'training_text'] = filtered_df['Interaction Notes']

# Select the training text, dropping empty rows (the tokenizer rejects NaN)
training_texts = filtered_df['training_text'].dropna().astype(str).tolist()

# Create CustomTextDataset for fine-tuning
train_dataset = prepare_dataset(training_texts, tokenizer)

# Data collator for causal language modeling: with mlm=False it copies
# input_ids into labels (masking padding positions to -100) at batch time
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)
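# Note: with a small dataset there will be far fewer than 10,000 steps, so no
# checkpoint is written during training; call trainer.save_model() afterwards
# if the fine-tuned weights need to persist beyond this process.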

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Fine-tune the model. Caveat: Streamlit re-runs this whole script on every
# widget interaction, so in a real app the training step should be run offline
# or wrapped in a cached function (e.g. @st.cache_resource) rather than inline.
trainer.train()

# Streamlit app
st.title("IT Support Assistant")

# Create a text generation pipeline
text_gen_pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer
)
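# The pipeline reuses the fine-tuned model already in memory, on whatever
# device the model currently occupies.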

def generate_response(input_text):
    # max_new_tokens bounds only the generated continuation; max_length would
    # also count the prompt tokens and could cut the reply short
    outputs = text_gen_pipeline(input_text, max_new_tokens=150, num_return_sequences=1)
    response = outputs[0]['generated_text']
    return response

input_text = st.text_input("Enter your IT support query:")

if st.button("Generate Response") and input_text:
    response = generate_response(input_text)
    st.write("Response:", response)