Update app.py
app.py CHANGED
@@ -1,23 +1,12 @@
 import os
 import pandas as pd
+import streamlit as st
 import transformers
 import torch
-from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling, AutoTokenizer, AutoModelForCausalLM
+from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling, AutoTokenizer, AutoModelForCausalLM, pipeline
 from torch.utils.data import Dataset
-import streamlit as st
-
-# Path to the directory where Ollama stores models
-model_dir = "C:/Users/myuser/.ollama/models/meta-llama-3-8b"
 
-#
-tokenizer = AutoTokenizer.from_pretrained(model_dir)
-model = AutoModelForCausalLM.from_pretrained(model_dir)
-
-# Add a padding token to the tokenizer if it doesn't have one
-if tokenizer.pad_token is None:
-    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
-
-# Custom Dataset class
+# Define the Custom Dataset class
 class CustomTextDataset(Dataset):
     def __init__(self, tokenized_inputs):
         self.input_ids = tokenized_inputs['input_ids']
@@ -38,64 +27,77 @@ def prepare_dataset(texts, tokenizer, block_size=128):
     dataset = CustomTextDataset(inputs)
     return dataset
 
+# Function to fine-tune the model
+def fine_tune_model(train_dataset, model, tokenizer):
+    data_collator = DataCollatorForLanguageModeling(
+        tokenizer=tokenizer,
+        mlm=False,
+    )
+
+    training_args = TrainingArguments(
+        output_dir="./results",
+        overwrite_output_dir=True,
+        num_train_epochs=3,
+        per_device_train_batch_size=4,
+        save_steps=10_000,
+        save_total_limit=2,
+    )
+
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        data_collator=data_collator,
+        train_dataset=train_dataset,
+    )
+
+    trainer.train()
+
 # Load the dataset
-… (old lines 42-49 not shown in the diff view)
-]
-… (old lines 51-79 not shown in the diff view)
-    args=training_args,
-    data_collator=data_collator,
-    train_dataset=train_dataset,
-)
-
-# Fine-tune the model
-trainer.train()
-
-# Streamlit app
-st.title("IT Support Assistant")
-
-# Create a text generation pipeline
-text_gen_pipeline = transformers.pipeline(
-    "text-generation",
-    model=model,
-    tokenizer=tokenizer
-)
+@st.cache
+def load_data(file_path):
+    df = pd.read_csv(file_path)
+    filtered_df = df[
+        (df['Resolution Status'] == 'Resolved') &
+        (df['Customer Satisfaction (CSAT) Score'] >= 4) &
+        (df['Customer Feedback Comments'].isin(['Very satisfied', 'Satisfied']))
+    ].copy()
+    filtered_df.loc[:, 'training_text'] = filtered_df['Interaction Notes']
+    training_texts = filtered_df['training_text'].tolist()
+    return training_texts
+
+# Streamlit UI
+st.title("IT Support Assistant - Training and Deployment")
+
+# File upload
+file_path = st.text_input("Path to the dataset CSV", "it_support_transcript_dataset.csv")
+train_button = st.button("Train Model")
+
+if file_path and train_button:
+    with st.spinner("Loading data and training the model..."):
+        training_texts = load_data(file_path)
+
+        # Load the tokenizer and model from Hugging Face
+        model_name = "meta-llama/Llama-2-7b-chat-hf"  # the available Llama chat model on the Hub
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        model = AutoModelForCausalLM.from_pretrained(model_name)
+
+        if tokenizer.pad_token is None:
+            tokenizer.add_special_tokens({'pad_token': '[PAD]'})
+
+        train_dataset = prepare_dataset(training_texts, tokenizer)
+        fine_tune_model(train_dataset, model, tokenizer)
+
+        st.success("Model trained successfully!")
+
+# Interactive interface
+st.title("IT Support Assistant - Interaction")
 
 def generate_response(input_text):
+    text_gen_pipeline = pipeline(
+        "text-generation",
+        model=model,
+        tokenizer=tokenizer
+    )
     outputs = text_gen_pipeline(input_text, max_length=150, num_return_sequences=1)
     response = outputs[0]['generated_text']
     return response
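
One follow-up on the pad-token handling above: add_special_tokens grows the tokenizer vocabulary, but the model's embedding matrix keeps its original size, so fine-tuning can index out of range on the new [PAD] id. A minimal sketch of the usual remedy, resizing the embeddings immediately after the token is added:

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    # Grow the embedding matrix to cover the newly added pad token
    model.resize_token_embeddings(len(tokenizer))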
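
As written, generate_response also rebuilds the text-generation pipeline on every call, and it closes over the module-level model and tokenizer, which only exist after the training branch has run. A sketch of constructing it once per session, assuming model and tokenizer are defined at module scope; get_pipeline is an illustrative helper, not part of the file, and st.cache_resource is the current Streamlit replacement for the deprecated st.cache on non-data objects (by the same token, @st.cache on load_data would be @st.cache_data today):

@st.cache_resource
def get_pipeline():
    # Built once per session and reused by every generate_response call
    return pipeline("text-generation", model=model, tokenizer=tokenizer)

def generate_response(input_text):
    outputs = get_pipeline()(input_text, max_length=150, num_return_sequences=1)
    return outputs[0]['generated_text']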