Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import pandas as pd
|
3 |
+
import transformers
|
4 |
+
import torch
|
5 |
+
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling, AutoTokenizer, AutoModelForCausalLM
|
6 |
+
from torch.utils.data import Dataset
|
7 |
+
import streamlit as st
|
8 |
+
|
9 |
+
# Path to the directory where Ollama stores models.
# Generalized: overridable via the MODEL_DIR environment variable so the
# script is not tied to one machine's filesystem layout; the original
# hard-coded path remains the default.
# NOTE(review): Ollama normally stores GGUF blobs, which transformers cannot
# load directly — confirm this directory holds an HF-format checkpoint
# (config.json + weight files).
model_dir = os.environ.get(
    "MODEL_DIR", "C:/Users/myuser/.ollama/models/meta-llama-3-8b"
)

# Load the tokenizer and model from the local checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForCausalLM.from_pretrained(model_dir)

# Add a padding token to the tokenizer if it doesn't have one
# (Llama tokenizers ship without a pad token, and padded batches need one).
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    # Fix: the embedding matrix must grow to cover the new token id,
    # otherwise training/generation indexes out of bounds.
    model.resize_token_embeddings(len(tokenizer))
20 |
+
# Minimal torch Dataset over a batch of pre-tokenized inputs.
class CustomTextDataset(Dataset):
    """Expose tokenizer output ('input_ids' / 'attention_mask') row-by-row."""

    def __init__(self, tokenized_inputs):
        # Keep only the two tensors the language-modeling collator consumes.
        self.input_ids = tokenized_inputs['input_ids']
        self.attention_masks = tokenized_inputs['attention_mask']

    def __len__(self):
        # One example per tokenized row.
        return len(self.input_ids)

    def __getitem__(self, idx):
        # Re-pair the idx-th row of ids with its attention mask.
        example = {'input_ids': self.input_ids[idx]}
        example['attention_mask'] = self.attention_masks[idx]
        return example
|
35 |
+
# Helper that turns raw strings into a ready-to-train dataset.
def prepare_dataset(texts, tokenizer, block_size=128):
    """Tokenize *texts* (truncated/padded to *block_size*) into a CustomTextDataset."""
    encoded = tokenizer(
        texts,
        return_tensors='pt',
        max_length=block_size,
        truncation=True,
        padding=True,
    )
    return CustomTextDataset(encoded)
|
41 |
+
# Load the raw support-transcript dataset from disk.
file_path = "path/to/it_support_transcript_dataset.csv"
df = pd.read_csv(file_path)

# Keep only transcripts from successful, well-rated interactions:
# resolved tickets with CSAT >= 4 and a positive feedback comment.
# .copy() detaches the slice so the assignment below is safe.
filtered_df = df[
    (df['Resolution Status'] == 'Resolved') &
    (df['Customer Satisfaction (CSAT) Score'] >= 4) &
    (df['Customer Feedback Comments'].isin(['Very satisfied', 'Satisfied']))
].copy()

# Use only the interaction notes as training text.
filtered_df.loc[:, 'training_text'] = filtered_df['Interaction Notes']

# Fix: drop rows with missing notes and coerce everything to str —
# NaN floats in this list would crash the tokenizer downstream.
training_texts = filtered_df['training_text'].dropna().astype(str).tolist()

# Tokenize into a CustomTextDataset for fine-tuning.
train_dataset = prepare_dataset(training_texts, tokenizer)
|
61 |
+
# Collator that builds causal-language-modeling batches
# (mlm=False: next-token objective, not masked-token).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Fine-tuning hyperparameters; checkpoints go to ./results and only the
# two most recent ones are retained.
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

# Wire model, hyperparameters, collator and data into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Run the fine-tuning loop.
trainer.train()
88 |
+
# Streamlit front-end.
st.title("IT Support Assistant")

# Reuse the (fine-tuned) model and tokenizer in a text-generation pipeline.
text_gen_pipeline = transformers.pipeline(
    "text-generation", model=model, tokenizer=tokenizer
)
98 |
+
def generate_response(input_text):
    """Return a single generated completion for *input_text*."""
    candidates = text_gen_pipeline(
        input_text, max_length=150, num_return_sequences=1
    )
    # The pipeline returns a list of candidate dicts; take the only one.
    return candidates[0]['generated_text']
103 |
+
# Collect the user's query and answer it on button press.
input_text = st.text_input("Enter your IT support query:")

if st.button("Generate Response"):
    response = generate_response(input_text)
    st.write("Response:", response)