# IT Support Assistant — Streamlit app that fine-tunes a Llama model on
# resolved, well-rated IT support transcripts and serves an interactive
# query interface.
import gradio as gr
import pandas as pd
import streamlit as st
import torch
import transformers
from datasets import load_dataset
from torch.utils.data import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
# Define the Custom Dataset class
class CustomTextDataset(Dataset):
    """Minimal torch Dataset over pre-tokenized inputs.

    Wraps the tokenizer output and yields one example per index as a
    dict with 'input_ids' and 'attention_mask', the format the HF
    Trainer's data collator expects.
    """

    def __init__(self, tokenized_inputs):
        # Hold references to the tokenizer's output tensors/sequences.
        self.input_ids = tokenized_inputs['input_ids']
        self.attention_masks = tokenized_inputs['attention_mask']

    def __len__(self):
        # One example per tokenized row.
        return len(self.input_ids)

    def __getitem__(self, idx):
        example = {'input_ids': self.input_ids[idx]}
        example['attention_mask'] = self.attention_masks[idx]
        return example
# Prepare dataset function
def prepare_dataset(texts, tokenizer, block_size=128):
    """Tokenize *texts* and wrap the result in a CustomTextDataset.

    Sequences are truncated/padded to at most ``block_size`` tokens and
    returned as PyTorch tensors.
    """
    encoded = tokenizer(
        texts,
        return_tensors='pt',
        max_length=block_size,
        truncation=True,
        padding=True,
    )
    return CustomTextDataset(encoded)
# Function to fine-tune the model
def fine_tune_model(train_dataset, model, tokenizer):
    """Fine-tune *model* on *train_dataset* with the HF Trainer.

    Uses a causal-LM objective (``mlm=False``) and fixed training
    hyperparameters; checkpoints land in ``./results``.
    """
    # Causal language modeling: the collator builds labels from inputs,
    # no masked-token objective.
    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    args = TrainingArguments(
        output_dir="./results",
        overwrite_output_dir=True,
        num_train_epochs=3,
        per_device_train_batch_size=4,
        save_steps=10_000,
        save_total_limit=2,
    )
    trainer = Trainer(
        model=model,
        args=args,
        data_collator=collator,
        train_dataset=train_dataset,
    )
    trainer.train()
# Load the dataset
def load_data():
    """Return training texts from resolved, highly-rated support tickets.

    Loads the transcript dataset, keeps rows that are resolved, have a
    CSAT score of at least 4, and carry positive feedback, and returns
    their interaction notes as a list of strings.
    """
    raw = load_dataset("it_support_transcript_dataset", split="train")
    df = pd.DataFrame(raw)
    # Build each filter separately for readability; combine with &.
    is_resolved = df['Resolution Status'] == 'Resolved'
    high_csat = df['Customer Satisfaction (CSAT) Score'] >= 4
    positive_feedback = df['Customer Feedback Comments'].isin(['Very satisfied', 'Satisfied'])
    filtered_df = df[is_resolved & high_csat & positive_feedback].copy()
    filtered_df.loc[:, 'training_text'] = filtered_df['Interaction Notes']
    return filtered_df['training_text'].tolist()
# Streamlit UI — training page.
st.title("IT Support Assistant - Training and Deployment")
train_button = st.button("Train Model")
if train_button:
    with st.spinner("Loading data and training the model..."):
        training_texts = load_data()
        # Load the tokenizer and model from Hugging Face
        model_name = "meta-llama/Meta-Llama-3-8B"  # Use the available Llama model
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name)
        if tokenizer.pad_token is None:
            tokenizer.add_special_tokens({'pad_token': '[PAD]'})
            # BUG FIX: adding [PAD] grows the tokenizer vocabulary, so the
            # model's token-embedding matrix must be resized to match;
            # otherwise the padding id indexes past the embedding table
            # during training.
            model.resize_token_embeddings(len(tokenizer))
        train_dataset = prepare_dataset(training_texts, tokenizer)
        fine_tune_model(train_dataset, model, tokenizer)
    st.success("Model trained successfully!")
# Interactive interface — query page.
st.title("IT Support Assistant - Interaction")

@st.cache_resource
def _load_llama_interface():
    """Load the hosted Llama-3 inference interface once per session.

    Cached with st.cache_resource so Streamlit reruns and repeated
    queries reuse the same connection instead of reloading it.
    """
    return gr.load("models/meta-llama/Meta-Llama-3-8B")

def generate_response(input_text):
    """Return the model's reply for an IT-support query.

    BUG FIX: the original called gr.load(...) inside this function, so
    every button press re-created the remote model interface; the cached
    loader removes that per-request overhead.
    """
    interface = _load_llama_interface()
    response = interface(input_text)[0]
    return response

input_text = st.text_input("Enter your IT support query:")
if st.button("Generate Response"):
    response = generate_response(input_text)
    st.write("Response:", response)