import streamlit as st
import numpy as np
import torch
import random
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    TrainerCallback,  # Import TrainerCallback here
)
from datasets import Dataset
from huggingface_hub import HfApi
import plotly.graph_objects as go
import time
from datetime import datetime
import threading


# Cyberpunk and Loading Animation Styling
def setup_cyberpunk_style():
    """Inject custom CSS into the Streamlit page.

    NOTE(review): the markdown payload here is empty apart from whitespace —
    the original CSS block appears to have been lost; confirm against the
    upstream version of this file.
    """
    st.markdown(""" """, unsafe_allow_html=True)


# Prepare Dataset Function with Padding Token Fix
def prepare_dataset(data, tokenizer, block_size=128):
    """Tokenize a list of raw text strings into a torch-formatted Dataset.

    Args:
        data: list[str] of raw training texts.
        tokenizer: a GPT-2 tokenizer; its pad token is set to EOS because
            GPT-2 ships without a dedicated padding token.
        block_size: max sequence length; shorter texts are padded to it.

    Returns:
        A `datasets.Dataset` with 'input_ids', 'attention_mask' and 'labels'
        columns formatted as torch tensors. Labels are a copy of input_ids
        (standard causal-LM setup).
    """
    tokenizer.pad_token = tokenizer.eos_token

    def tokenize_function(examples):
        return tokenizer(examples['text'], truncation=True, max_length=block_size, padding='max_length')

    raw_dataset = Dataset.from_dict({'text': data})
    tokenized_dataset = raw_dataset.map(tokenize_function, batched=True, remove_columns=['text'])
    tokenized_dataset = tokenized_dataset.map(lambda examples: {'labels': examples['input_ids']}, batched=True)
    tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    return tokenized_dataset


# Training Dashboard Class with Enhanced Display
class TrainingDashboard:
    """In-memory store of training metrics for display in the UI.

    Tracks current/best loss, generation & individual counters, throughput
    (items processed per second since construction), and a timestamped
    loss history.
    """

    def __init__(self):
        self.metrics = {
            'current_loss': 0,
            'best_loss': float('inf'),
            'generation': 0,
            'individual': 0,
            'start_time': time.time(),
            'training_speed': 0,
        }
        self.history = []

    def update(self, loss, generation, individual):
        """Record one progress update and refresh derived metrics."""
        self.metrics['current_loss'] = loss
        self.metrics['generation'] = generation
        self.metrics['individual'] = individual
        if loss < self.metrics['best_loss']:
            self.metrics['best_loss'] = loss
        elapsed_time = time.time() - self.metrics['start_time']
        # BUG FIX: guard against a zero elapsed time (update called
        # immediately after construction) which previously could raise
        # ZeroDivisionError.
        if elapsed_time > 0:
            self.metrics['training_speed'] = (generation * individual) / elapsed_time
        self.history.append({'loss': loss, 'timestamp': datetime.now().strftime('%H:%M:%S')})


# Define Model Initialization
def initialize_model(model_name="gpt2"):
    """Load a pretrained GPT-2 model + tokenizer.

    The tokenizer's pad token is set to EOS (GPT-2 has no pad token).
    Returns (model, tokenizer).
    """
    model = GPT2LMHeadModel.from_pretrained(model_name)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    return model, tokenizer


# Load Dataset Function with Uploaded File Option
def load_dataset(data_source="demo", tokenizer=None, uploaded_file=None):
    """Build a tokenized dataset from the demo corpus or an uploaded file.

    Args:
        data_source: "demo" uses the built-in sample sentences; any other
            value expects `uploaded_file`.
        tokenizer: tokenizer passed through to `prepare_dataset`.
        uploaded_file: a Streamlit UploadedFile (.txt or .csv). For CSV,
            the FIRST column is assumed to hold the text data.

    Returns:
        The tokenized `datasets.Dataset`.
    """
    if data_source == "demo":
        data = ["In the neon-lit streets of Neo-Tokyo, a lone hacker fights against the oppressive megacorporations.",
                "The rain falls in sheets, washing away the bloodstains from the alleyways.",
                "She plugs into the matrix, seeking answers to questions that have haunted her for years."]
    elif uploaded_file is not None:
        if uploaded_file.name.endswith(".txt"):
            data = [uploaded_file.read().decode("utf-8")]
        elif uploaded_file.name.endswith(".csv"):
            import pandas as pd
            df = pd.read_csv(uploaded_file)
            data = df[df.columns[0]].tolist()  # assuming first column is text data
        else:
            # BUG FIX: previously an unsupported extension left `data`
            # unassigned, raising NameError below.
            data = ["No file uploaded. Please upload a dataset."]
    else:
        data = ["No file uploaded. Please upload a dataset."]
    dataset = prepare_dataset(data, tokenizer)
    return dataset


class ProgressCallback(TrainerCallback):
    """Trainer callback that forwards per-epoch progress to a UI callback.

    Moved above `train_model` so it is defined before it is referenced.
    """

    def __init__(self, progress_callback):
        super().__init__()
        self.progress_callback = progress_callback

    def on_epoch_end(self, args, state, control, **kwargs):
        # BUG FIX: do nothing when no UI callback was supplied (the
        # default in train_model) — previously this raised TypeError.
        if self.progress_callback is None:
            return
        # BUG FIX: the last log_history entry is not guaranteed to carry a
        # 'loss' key (eval/step entries) — scan backwards for the most
        # recent training loss instead of indexing [-1]['loss'] blindly.
        loss = next(
            (entry['loss'] for entry in reversed(state.log_history) if 'loss' in entry),
            float('nan'),
        )
        generation = state.global_step // args.gradient_accumulation_steps + 1
        individual = args.gradient_accumulation_steps
        self.progress_callback(loss, generation, individual)


# Train Model Function with Customized Progress Bar
def train_model(model, train_dataset, tokenizer, epochs=3, batch_size=4, progress_callback=None):
    """Fine-tune `model` on `train_dataset` with HuggingFace Trainer.

    Args:
        model: the GPT2LMHeadModel to train (updated in place).
        train_dataset: tokenized dataset from `prepare_dataset`.
        tokenizer: tokenizer used by the causal-LM data collator.
        epochs: number of training epochs.
        batch_size: per-device batch size.
        progress_callback: optional fn(loss, generation, individual)
            invoked at each epoch end (e.g. TrainingDashboard.update).
    """
    training_args = TrainingArguments(
        output_dir="./results",
        overwrite_output_dir=True,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        save_steps=10_000,
        save_total_limit=2,
        logging_dir="./logs",
        logging_steps=100,
    )
    # mlm=False => causal language modeling (GPT-2), not masked LM.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        callbacks=[ProgressCallback(progress_callback)],
    )
    trainer.train()


# Main App
Logic def main(): setup_cyberpunk_style() st.markdown('