Spaces:
Sleeping
Sleeping
File size: 3,130 Bytes
4977a31 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 |
import os
import pandas as pd
import transformers
import torch
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling, AutoTokenizer, AutoModelForCausalLM
from torch.utils.data import Dataset
import streamlit as st
# Path to the directory where Ollama stores models.
# NOTE(review): machine-specific absolute path — consider making it
# configurable (env var / CLI argument) before sharing this script.
model_dir = "C:/Users/myuser/.ollama/models/meta-llama-3-8b"

# Load the tokenizer and causal-LM weights from the local directory.
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForCausalLM.from_pretrained(model_dir)

# Llama tokenizers ship without a padding token, but batch tokenization
# with padding=True (see prepare_dataset) requires one.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    # BUG FIX: adding a special token grows the vocabulary, so the model's
    # input embedding matrix must be resized to match — otherwise training
    # hits an out-of-range embedding index for the new [PAD] id.
    model.resize_token_embeddings(len(tokenizer))
# Custom Dataset class
class CustomTextDataset(Dataset):
    """Minimal torch Dataset wrapping pre-tokenized model inputs.

    Holds the ``input_ids`` and ``attention_mask`` produced by a Hugging
    Face tokenizer and serves them one example at a time in the dict
    shape the data collator expects.
    """

    def __init__(self, tokenized_inputs):
        # Keep direct references to the two fields the Trainer consumes.
        self.input_ids = tokenized_inputs['input_ids']
        self.attention_masks = tokenized_inputs['attention_mask']

    def __len__(self):
        # One example per row of input_ids.
        return len(self.input_ids)

    def __getitem__(self, idx):
        # A single example: ids plus its matching attention mask.
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx],
        }
# Prepare dataset function
def prepare_dataset(texts, tokenizer, block_size=128):
    """Tokenize *texts* and wrap the result in a CustomTextDataset.

    Each text is truncated to at most ``block_size`` tokens, the batch is
    padded to a uniform length, and everything is returned as PyTorch
    tensors ready for the Trainer.
    """
    encoded = tokenizer(
        texts,
        return_tensors='pt',
        max_length=block_size,
        truncation=True,
        padding=True,
    )
    return CustomTextDataset(encoded)
# Load the raw IT-support transcript dataset.
# NOTE(review): placeholder path — must point at the real CSV before running.
file_path = "path/to/it_support_transcript_dataset.csv"
df = pd.read_csv(file_path)

# Keep only high-quality, successfully handled interactions: resolved
# tickets with a CSAT score of at least 4 and positive written feedback.
is_resolved = df['Resolution Status'] == 'Resolved'
is_high_csat = df['Customer Satisfaction (CSAT) Score'] >= 4
is_positive = df['Customer Feedback Comments'].isin(['Very satisfied', 'Satisfied'])
filtered_df = df[is_resolved & is_high_csat & is_positive].copy()

# The interaction notes alone form the training text.
filtered_df.loc[:, 'training_text'] = filtered_df['Interaction Notes']
training_texts = filtered_df['training_text'].tolist()

# Tokenize and wrap the texts for the Trainer.
train_dataset = prepare_dataset(training_texts, tokenizer)
# Data collator for causal language modeling: mlm=False makes the collator
# derive labels by shifting input_ids rather than masking random tokens.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Training arguments.
# NOTE(review): trainer.train() below runs at module import time, and
# Streamlit re-executes this entire script on every user interaction —
# fine-tuning would restart on each button click. Consider caching the
# trained model (e.g. st.cache_resource) — confirm intended behavior.
training_args = TrainingArguments(
    output_dir="./results",          # checkpoints and outputs land here
    overwrite_output_dir=True,       # clobber any previous run in ./results
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,               # checkpoint every 10k optimizer steps
    save_total_limit=2,              # keep only the two newest checkpoints
)

# Trainer wires together the model, config, collator, and dataset.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Fine-tune the model (blocking call; runs as a module-level side effect).
trainer.train()
# Streamlit app header.
st.title("IT Support Assistant")

# Text-generation pipeline built on the freshly fine-tuned model and
# tokenizer already in memory (no reload from disk).
text_gen_pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer
)
def generate_response(input_text):
    """Generate one model completion for *input_text*.

    Uses the module-level text-generation pipeline; the output is capped
    at 150 tokens (prompt included, per max_length semantics) and a single
    sequence is sampled.
    """
    outputs = text_gen_pipeline(
        input_text,
        max_length=150,
        num_return_sequences=1,
    )
    return outputs[0]['generated_text']
# Simple query/response UI: only call the model when the button is pressed.
input_text = st.text_input("Enter your IT support query:")
if st.button("Generate Response"):
    response = generate_response(input_text)
    st.write("Response:", response)
|