Karlsen committed on
Commit
e396e88
1 Parent(s): 341c785

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +71 -69
app.py CHANGED
@@ -1,23 +1,12 @@
1
  import os
2
  import pandas as pd
 
3
  import transformers
4
  import torch
5
- from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling, AutoTokenizer, AutoModelForCausalLM
6
  from torch.utils.data import Dataset
7
- import streamlit as st
8
-
9
- # Path to the directory where Ollama stores models
10
- model_dir = "C:/Users/myuser/.ollama/models/meta-llama-3-8b"
11
 
12
- # Load the tokenizer and model
13
- tokenizer = AutoTokenizer.from_pretrained(model_dir)
14
- model = AutoModelForCausalLM.from_pretrained(model_dir)
15
-
16
- # Add a padding token to the tokenizer if it doesn't have one
17
- if tokenizer.pad_token is None:
18
- tokenizer.add_special_tokens({'pad_token': '[PAD]'})
19
-
20
- # Custom Dataset class
21
  class CustomTextDataset(Dataset):
22
  def __init__(self, tokenized_inputs):
23
  self.input_ids = tokenized_inputs['input_ids']
@@ -38,64 +27,77 @@ def prepare_dataset(texts, tokenizer, block_size=128):
38
  dataset = CustomTextDataset(inputs)
39
  return dataset
40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  # Load the dataset
42
- file_path = "path/to/it_support_transcript_dataset.csv"
43
- df = pd.read_csv(file_path)
44
-
45
- # Filter the dataset based on the given criteria and make a copy
46
- filtered_df = df[
47
- (df['Resolution Status'] == 'Resolved') &
48
- (df['Customer Satisfaction (CSAT) Score'] >= 4) &
49
- (df['Customer Feedback Comments'].isin(['Very satisfied', 'Satisfied']))
50
- ].copy()
51
-
52
- # Combine only the interaction notes into a single text for training
53
- filtered_df.loc[:, 'training_text'] = filtered_df['Interaction Notes']
54
-
55
- # Select the training text
56
- training_texts = filtered_df['training_text'].tolist()
57
-
58
- # Create CustomTextDataset for fine-tuning
59
- train_dataset = prepare_dataset(training_texts, tokenizer)
60
-
61
- # Data collator for language modeling
62
- data_collator = DataCollatorForLanguageModeling(
63
- tokenizer=tokenizer,
64
- mlm=False,
65
- )
66
-
67
- # Training arguments
68
- training_args = TrainingArguments(
69
- output_dir="./results",
70
- overwrite_output_dir=True,
71
- num_train_epochs=3,
72
- per_device_train_batch_size=4,
73
- save_steps=10_000,
74
- save_total_limit=2,
75
- )
76
-
77
- # Trainer
78
- trainer = Trainer(
79
- model=model,
80
- args=training_args,
81
- data_collator=data_collator,
82
- train_dataset=train_dataset,
83
- )
84
-
85
- # Fine-tune the model
86
- trainer.train()
87
-
88
- # Streamlit app
89
- st.title("IT Support Assistant")
90
-
91
- # Create a text generation pipeline
92
- text_gen_pipeline = transformers.pipeline(
93
- "text-generation",
94
- model=model,
95
- tokenizer=tokenizer
96
- )
97
 
98
  def generate_response(input_text):
 
 
 
 
 
99
  outputs = text_gen_pipeline(input_text, max_length=150, num_return_sequences=1)
100
  response = outputs[0]['generated_text']
101
  return response
 
1
  import os
2
  import pandas as pd
3
+ import streamlit as st
4
  import transformers
5
  import torch
6
+ from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling, AutoTokenizer, AutoModelForCausalLM, pipeline
7
  from torch.utils.data import Dataset
 
 
 
 
8
 
9
+ # Define the Custom Dataset class
 
 
 
 
 
 
 
 
10
  class CustomTextDataset(Dataset):
11
  def __init__(self, tokenized_inputs):
12
  self.input_ids = tokenized_inputs['input_ids']
 
27
  dataset = CustomTextDataset(inputs)
28
  return dataset
29
 
30
# Function to fine-tune the model
def fine_tune_model(train_dataset, model, tokenizer):
    """Fine-tune a causal language model on *train_dataset* with the HF Trainer.

    Runs 3 epochs at per-device batch size 4, checkpointing to ./results
    (at most 2 checkpoints kept, every 10,000 steps). Mutates *model*
    in place and returns None.
    """
    # mlm=False: plain causal-LM objective, no token masking.
    collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )

    args = TrainingArguments(
        output_dir="./results",
        overwrite_output_dir=True,
        num_train_epochs=3,
        per_device_train_batch_size=4,
        save_steps=10_000,
        save_total_limit=2,
    )

    Trainer(
        model=model,
        args=args,
        data_collator=collator,
        train_dataset=train_dataset,
    ).train()
54
+
55
# Load the dataset
@st.cache
def load_data(file_path):
    """Load the support-transcript CSV and return texts for fine-tuning.

    Keeps only rows that represent successful interactions: resolved
    tickets with a CSAT score of at least 4 and positive feedback.

    Parameters
    ----------
    file_path : str
        Path to the support transcript CSV file.

    Returns
    -------
    list[str]
        The 'Interaction Notes' of the selected rows.
    """
    # BUG FIX: the parameter was written as `it_support_transcript_dataset.csv`,
    # which is not a valid Python identifier (SyntaxError). The caller passes
    # `file_path`, so that is the correct parameter name.
    df = pd.read_csv(file_path)
    # Filter to resolved, well-rated, positively reviewed interactions.
    filtered_df = df[
        (df['Resolution Status'] == 'Resolved') &
        (df['Customer Satisfaction (CSAT) Score'] >= 4) &
        (df['Customer Feedback Comments'].isin(['Very satisfied', 'Satisfied']))
    ].copy()
    # Only the interaction notes are used as training text.
    filtered_df.loc[:, 'training_text'] = filtered_df['Interaction Notes']
    training_texts = filtered_df['training_text'].tolist()
    return training_texts
67
+
68
# Streamlit UI
st.title("IT Support Assistant - Training and Deployment")

# Ask for the dataset location (pre-filled with the expected filename)
# instead of using the filename itself as the field label.
file_path = st.text_input("Dataset CSV path", value="it_support_transcript_dataset.csv")
train_button = st.button("Train Model")

if file_path and train_button:
    with st.spinner("Loading data and training the model..."):
        training_texts = load_data(file_path)

        # Load the tokenizer and model from Hugging Face.
        # BUG FIX: "meta-llama/Meta-Llama-2-7B-chat-hf" is not a valid Hub
        # repo id; the Llama-2 7B chat repository is named as below.
        model_name = "meta-llama/Llama-2-7b-chat-hf"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name)

        # Llama tokenizers ship without a pad token, which the data
        # collator needs for batching.
        if tokenizer.pad_token is None:
            tokenizer.add_special_tokens({'pad_token': '[PAD]'})

        train_dataset = prepare_dataset(training_texts, tokenizer)
        fine_tune_model(train_dataset, model, tokenizer)

    st.success("Model trained successfully!")

# Interactive interface
st.title("IT Support Assistant - Interaction")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
def generate_response(input_text):
    """Return the model's generated continuation for *input_text*.

    Uses a text-generation pipeline capped at 150 tokens, returning a
    single sequence's generated text.
    """
    # NOTE(review): a new pipeline is constructed on every call, and the
    # module-level `model`/`tokenizer` are only defined inside the training
    # branch — calling this before training raises NameError; confirm intent.
    generator = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
    )
    results = generator(input_text, max_length=150, num_return_sequences=1)
    return results[0]['generated_text']