# Fine-tune a local Llama-3 model on IT-support transcripts and serve it via Streamlit.
import os

import pandas as pd
import streamlit as st
import torch
import transformers
from torch.utils.data import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
# Path to the directory where Ollama stores models.
# NOTE(review): hard-coded user-specific Windows path — consider reading it from
# an environment variable or CLI argument instead.
model_dir = "C:/Users/myuser/.ollama/models/meta-llama-3-8b"

# Load the tokenizer and model from the local directory.
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForCausalLM.from_pretrained(model_dir)

# Add a padding token to the tokenizer if it doesn't have one.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    # BUG FIX: adding a token grows the vocabulary, so the model's embedding
    # matrix must be resized to match — otherwise the new pad token id indexes
    # past the end of the embedding table during training/inference.
    model.resize_token_embeddings(len(tokenizer))
class CustomTextDataset(Dataset):
    """Torch Dataset over already-tokenized text.

    Wraps the output of a Hugging Face tokenizer call and yields one
    ``{'input_ids', 'attention_mask'}`` pair per row, which is the shape
    the language-modeling data collator expects.
    """

    def __init__(self, tokenized_inputs):
        # Keep only the two fields the collator consumes.
        self.input_ids = tokenized_inputs['input_ids']
        self.attention_masks = tokenized_inputs['attention_mask']

    def __len__(self):
        # One example per tokenized row.
        return len(self.input_ids)

    def __getitem__(self, idx):
        # Pair the row's token ids with its attention mask.
        example = {'input_ids': self.input_ids[idx]}
        example['attention_mask'] = self.attention_masks[idx]
        return example
def prepare_dataset(texts, tokenizer, block_size=128):
    """Tokenize *texts* and wrap the result in a :class:`CustomTextDataset`.

    Each text is truncated/padded to ``block_size`` tokens and returned as
    PyTorch tensors, ready for the Trainer's data collator.
    """
    encoded = tokenizer(
        texts,
        return_tensors='pt',
        max_length=block_size,
        truncation=True,
        padding=True,
    )
    return CustomTextDataset(encoded)
# Load the raw support-transcript dataset.
file_path = "path/to/it_support_transcript_dataset.csv"
df = pd.read_csv(file_path)

# Keep only resolved tickets with a high CSAT score and positive feedback;
# .copy() avoids SettingWithCopy warnings on the assignment below.
resolved = df['Resolution Status'] == 'Resolved'
high_csat = df['Customer Satisfaction (CSAT) Score'] >= 4
positive_feedback = df['Customer Feedback Comments'].isin(['Very satisfied', 'Satisfied'])
filtered_df = df[resolved & high_csat & positive_feedback].copy()

# Only the interaction notes form the training text.
filtered_df.loc[:, 'training_text'] = filtered_df['Interaction Notes']
training_texts = filtered_df['training_text'].tolist()
# Tokenize the selected transcripts into a fine-tuning dataset.
train_dataset = prepare_dataset(training_texts, tokenizer)

# Causal-LM collator: mlm=False means labels are the inputs shifted for
# next-token prediction rather than masked-language-modeling targets.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
# Hyper-parameters for the fine-tuning run; checkpoints go to ./results and
# only the two most recent are kept on disk.
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

# Wire the model, dataset and collator into a Trainer, then fine-tune.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)
trainer.train()
# --- Streamlit UI ---
st.title("IT Support Assistant")

# Reuse the freshly fine-tuned model and tokenizer for text generation.
text_gen_pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
def generate_response(input_text):
    """Return a single generated completion (at most 150 tokens) for the query."""
    results = text_gen_pipeline(input_text, max_length=150, num_return_sequences=1)
    return results[0]['generated_text']
# Query box + button; on click, show the model's answer.
input_text = st.text_input("Enter your IT support query:")
if st.button("Generate Response"):
    st.write("Response:", generate_response(input_text))