"""Streamlit IT-support assistant: fine-tunes a causal LM on resolved,
high-CSAT support transcripts, then serves it as a chat-style text generator.

Pipeline: load tokenizer/model -> filter transcript CSV -> tokenize ->
fine-tune with Trainer -> expose a text-generation box in Streamlit.
"""

import os

import pandas as pd
import streamlit as st
import torch
import transformers
from torch.utils.data import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# Path to the directory where Ollama stores models.
# NOTE(review): Ollama normally stores models as GGUF blobs, which
# AutoTokenizer/AutoModelForCausalLM cannot load directly. This path must
# point at a Hugging Face-format directory (config.json, tokenizer files,
# *.safetensors) -- confirm before running.
MODEL_DIR = "C:/Users/myuser/.ollama/models/meta-llama-3-8b"

# Source CSV of support transcripts (placeholder path from the original).
DATASET_PATH = "path/to/it_support_transcript_dataset.csv"


class CustomTextDataset(Dataset):
    """Thin torch Dataset over a pre-tokenized batch.

    Wraps the `input_ids` / `attention_mask` tensors produced by a single
    tokenizer call so the Trainer can index examples one at a time.
    """

    def __init__(self, tokenized_inputs):
        # tokenized_inputs is the BatchEncoding returned by tokenizer(...).
        self.input_ids = tokenized_inputs['input_ids']
        self.attention_masks = tokenized_inputs['attention_mask']

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx],
        }


def prepare_dataset(texts, tokenizer, block_size=128):
    """Tokenize `texts` (list of str) into a CustomTextDataset.

    Sequences are truncated/padded to at most `block_size` tokens.
    """
    inputs = tokenizer(
        texts,
        return_tensors='pt',
        max_length=block_size,
        truncation=True,
        padding=True,
    )
    return CustomTextDataset(inputs)


def load_training_texts(file_path):
    """Read the transcript CSV and return the curated training texts.

    Keeps only interactions that were resolved, scored CSAT >= 4, and
    carried a 'Very satisfied' / 'Satisfied' comment; the training text is
    the interaction notes column.
    """
    df = pd.read_csv(file_path)
    filtered_df = df[
        (df['Resolution Status'] == 'Resolved')
        & (df['Customer Satisfaction (CSAT) Score'] >= 4)
        & (df['Customer Feedback Comments'].isin(['Very satisfied', 'Satisfied']))
    ].copy()
    # Only the interaction notes are used as training text.
    filtered_df.loc[:, 'training_text'] = filtered_df['Interaction Notes']
    return filtered_df['training_text'].tolist()


@st.cache_resource
def build_text_gen_pipeline():
    """Load the model, fine-tune it once, and return a generation pipeline.

    BUG FIX (original): Streamlit re-executes the whole script on every
    widget interaction, so the original code re-loaded and re-trained the
    model on every button click. `st.cache_resource` makes this run once
    per server process.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
    model = AutoModelForCausalLM.from_pretrained(MODEL_DIR)

    # Add a padding token if the tokenizer lacks one (Llama models do).
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        # BUG FIX (original): the embedding matrix must be resized to cover
        # the newly added token id, or training indexes out of range.
        model.resize_token_embeddings(len(tokenizer))

    train_dataset = prepare_dataset(load_training_texts(DATASET_PATH), tokenizer)

    # mlm=False -> causal-LM objective (labels are shifted input_ids).
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    training_args = TrainingArguments(
        output_dir="./results",
        overwrite_output_dir=True,
        num_train_epochs=3,
        per_device_train_batch_size=4,
        save_steps=10_000,
        save_total_limit=2,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )
    trainer.train()

    return transformers.pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
    )


# ---------------------------------------------------------------------------
# Streamlit app
# ---------------------------------------------------------------------------
st.title("IT Support Assistant")

text_gen_pipeline = build_text_gen_pipeline()


def generate_response(input_text):
    """Generate a single completion (<=150 tokens total) for the query."""
    outputs = text_gen_pipeline(input_text, max_length=150, num_return_sequences=1)
    return outputs[0]['generated_text']


input_text = st.text_input("Enter your IT support query:")

if st.button("Generate Response"):
    # Guard against an empty query instead of feeding "" to the model.
    if input_text.strip():
        response = generate_response(input_text)
        st.write("Response:", response)
    else:
        st.warning("Please enter a query first.")