Karlsen committed on
Commit
4977a31
1 Parent(s): 2318f6f

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +107 -0
app.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ import transformers
4
+ import torch
5
+ from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling, AutoTokenizer, AutoModelForCausalLM
6
+ from torch.utils.data import Dataset
7
+ import streamlit as st
8
+
9
# Model location: keep the original hard-coded Ollama path as the default,
# but allow an environment-variable override so the app is portable.
model_dir = os.environ.get(
    "MODEL_DIR", "C:/Users/myuser/.ollama/models/meta-llama-3-8b"
)

# Load the tokenizer and causal-LM model from the local directory.
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForCausalLM.from_pretrained(model_dir)

# Llama tokenizers ship without a pad token; batched padding requires one.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    # BUG FIX: adding a new special token grows the vocabulary, so the
    # model's embedding matrix must be resized to match — otherwise
    # training fails with out-of-range token ids for '[PAD]'.
    model.resize_token_embeddings(len(tokenizer))
19
+
20
class CustomTextDataset(Dataset):
    """Map-style dataset over pre-tokenized inputs for the HF Trainer."""

    def __init__(self, tokenized_inputs):
        # Keep only the two fields the language-modeling collator needs.
        self.input_ids = tokenized_inputs['input_ids']
        self.attention_masks = tokenized_inputs['attention_mask']

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        item = {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx],
        }
        return item
34
+
35
def prepare_dataset(texts, tokenizer, block_size=128):
    """Tokenize *texts* (truncated/padded to *block_size*) and wrap them
    in a CustomTextDataset ready for the Trainer."""
    encoded = tokenizer(
        texts,
        return_tensors='pt',
        max_length=block_size,
        truncation=True,
        padding=True,
    )
    return CustomTextDataset(encoded)
40
+
41
# Load the raw IT-support transcript dataset.
file_path = "path/to/it_support_transcript_dataset.csv"
df = pd.read_csv(file_path)

# Keep only resolved, highly rated interactions with positive feedback.
# .copy() avoids SettingWithCopy warnings on the later column assignment.
filtered_df = df[
    (df['Resolution Status'] == 'Resolved') &
    (df['Customer Satisfaction (CSAT) Score'] >= 4) &
    (df['Customer Feedback Comments'].isin(['Very satisfied', 'Satisfied']))
].copy()

# ROBUSTNESS FIX: missing interaction notes surface as float NaN, which
# crashes the tokenizer; drop those rows and coerce the rest to str.
filtered_df = filtered_df.dropna(subset=['Interaction Notes'])

# The interaction notes are the only text used for fine-tuning.
filtered_df.loc[:, 'training_text'] = filtered_df['Interaction Notes'].astype(str)
training_texts = filtered_df['training_text'].tolist()

# Tokenize and wrap the selected texts for the Trainer.
train_dataset = prepare_dataset(training_texts, tokenizer)
60
+
61
# Collator for causal-LM batches (mlm=False -> labels are shifted inputs).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Hyper-parameters for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

# Wire the model, data and arguments together.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Run the fine-tuning loop on the filtered transcripts.
trainer.train()
87
+
88
# --- Streamlit front-end ---
st.title("IT Support Assistant")

# Inference pipeline reusing the fine-tuned model and tokenizer.
text_gen_pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
97
+
98
def generate_response(input_text):
    """Return a single generated completion for *input_text*."""
    result = text_gen_pipeline(
        input_text, max_length=150, num_return_sequences=1
    )
    return result[0]['generated_text']
102
+
103
# Query box; generation runs only when the button is pressed.
input_text = st.text_input("Enter your IT support query:")

if st.button("Generate Response"):
    st.write("Response:", generate_response(input_text))