import streamlit as st
import time
import threading
from datetime import datetime

from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    TrainerCallback,
)
from datasets import Dataset
from huggingface_hub import login


# Cyberpunk and Loading Animation Styling
def setup_cyberpunk_style():
    # The original CSS payload was not preserved; custom styles go here.
    st.markdown("""
    <style>
    /* cyberpunk theme CSS */
    </style>
    """, unsafe_allow_html=True)


# Prepare Dataset Function with Padding Token Fix
def prepare_dataset(data, tokenizer, block_size=128):
    # GPT-2 ships without a pad token, so reuse the EOS token for padding.
    tokenizer.pad_token = tokenizer.eos_token

    def tokenize_function(examples):
        return tokenizer(examples['text'], truncation=True,
                         max_length=block_size, padding='max_length')

    raw_dataset = Dataset.from_dict({'text': data})
    tokenized_dataset = raw_dataset.map(tokenize_function, batched=True,
                                        remove_columns=['text'])
    # For causal language modeling, the labels are the input ids themselves.
    tokenized_dataset = tokenized_dataset.map(
        lambda examples: {'labels': examples['input_ids']}, batched=True)
    tokenized_dataset.set_format(type='torch',
                                 columns=['input_ids', 'attention_mask', 'labels'])
    return tokenized_dataset


# Training Dashboard Class with Enhanced Display
class TrainingDashboard:
    def __init__(self):
        self.metrics = {
            'current_loss': 0,
            'best_loss': float('inf'),
            'generation': 0,
            'individual': 0,
            'start_time': time.time(),
            'training_speed': 0
        }
        self.history = []

    def update(self, loss, generation, individual):
        self.metrics['current_loss'] = loss
        self.metrics['generation'] = generation
        self.metrics['individual'] = individual
        if loss < self.metrics['best_loss']:
            self.metrics['best_loss'] = loss
        elapsed_time = time.time() - self.metrics['start_time']
        self.metrics['training_speed'] = (generation * individual) / elapsed_time
        self.history.append({'loss': loss,
                             'timestamp': datetime.now().strftime('%H:%M:%S')})


# Define Model Initialization
def initialize_model(model_name="gpt2"):
    model = GPT2LMHeadModel.from_pretrained(model_name)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    return model, tokenizer


# Load Dataset Function with Uploaded File Option
def load_dataset(data_source="demo", tokenizer=None, uploaded_file=None):
    if data_source == "demo":
        data = [
            "In the neon-lit streets of Neo-Tokyo, a lone hacker fights against the oppressive megacorporations.",
            "The rain falls in sheets, washing away the bloodstains from the alleyways.",
            "She plugs into the matrix, seeking answers to questions that have haunted her for years."
        ]
    elif uploaded_file is not None:
        if uploaded_file.name.endswith(".txt"):
            data = [uploaded_file.read().decode("utf-8")]
        elif uploaded_file.name.endswith(".csv"):
            import pandas as pd
            df = pd.read_csv(uploaded_file)
            data = df[df.columns[0]].tolist()  # assuming first column is text data
    else:
        data = ["No file uploaded. Please upload a dataset."]
    dataset = prepare_dataset(data, tokenizer)
    return dataset


# Epoch-end callback that reports training progress back to the UI
class ProgressCallback(TrainerCallback):
    def __init__(self, progress_callback):
        super().__init__()
        self.progress_callback = progress_callback

    def on_epoch_end(self, args, state, control, **kwargs):
        # Not every log entry carries a loss, so search backwards for one
        # instead of indexing log_history[-1] directly.
        loss = next((log['loss'] for log in reversed(state.log_history)
                     if 'loss' in log), float('nan'))
        generation = state.global_step // args.gradient_accumulation_steps + 1
        individual = args.gradient_accumulation_steps
        if self.progress_callback is not None:
            self.progress_callback(loss, generation, individual)


# Train Model Function with Customized Progress Bar
def train_model(model, train_dataset, tokenizer, epochs=3, batch_size=4,
                learning_rate=3e-5, warmup_steps=100, weight_decay=0.01,
                progress_callback=None):
    training_args = TrainingArguments(
        output_dir="./results",
        overwrite_output_dir=True,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        learning_rate=learning_rate,
        warmup_steps=warmup_steps,
        weight_decay=weight_decay,
        save_steps=10_000,
        save_total_limit=2,
        logging_dir="./logs",
        logging_steps=100,
    )
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        callbacks=[ProgressCallback(progress_callback)]
    )
    trainer.train()
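
# Optional: cache model loading across Streamlit reruns.
# A minimal sketch, assuming a Streamlit version that provides
# st.cache_resource (1.18+). Streamlit re-executes the whole script on every
# interaction, so without caching the GPT-2 weights are re-instantiated each
# time. Not wired into main() below; swap it in for initialize_model() if desired.
@st.cache_resource
def load_cached_model(model_name="gpt2"):
    return initialize_model(model_name)
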

# Main App Logic
def main():
    setup_cyberpunk_style()
    # Title banner (the original wrapper markup was not preserved)
    st.markdown("<h1 style='text-align: center;'>Neural Training Hub</h1>",
                unsafe_allow_html=True)
    # Sidebar Configuration with Additional Options
    with st.sidebar:
        st.markdown("### Configuration Panel")

        # Hugging Face API Token Input
        hf_token = st.text_input("Enter your Hugging Face Token", type="password")
        if hf_token:
            # login() registers the token (HfApi.set_access_token has been
            # removed from recent versions of huggingface_hub).
            login(token=hf_token)
            st.success("Hugging Face token added successfully!")

        # Training Parameters
        training_epochs = st.slider("Training Epochs", min_value=1, max_value=5, value=3)
        batch_size = st.slider("Batch Size", min_value=2, max_value=8, value=4)
        model_choice = st.selectbox("Model Selection", ("gpt2", "distilgpt2", "gpt2-medium"))

        # Dataset Source Selection
        data_source = st.selectbox("Data Source", ("demo", "uploaded file"))
        uploaded_file = (st.file_uploader("Upload a text file", type=["txt", "csv"])
                         if data_source == "uploaded file" else None)

        custom_learning_rate = st.slider("Learning Rate", min_value=1e-6,
                                         max_value=5e-4, value=3e-5, step=1e-6)

        # Advanced Settings Toggle
        advanced_toggle = st.checkbox("Advanced Training Settings")
        if advanced_toggle:
            warmup_steps = st.slider("Warmup Steps", min_value=0, max_value=500, value=100)
            weight_decay = st.slider("Weight Decay", min_value=0.0, max_value=0.1,
                                     step=0.01, value=0.01)
        else:
            warmup_steps = 100
            weight_decay = 0.01

    # Initialize model and tokenizer from the selected checkpoint
    model, tokenizer = initialize_model(model_choice)

    # Load Dataset
    train_dataset = load_dataset(data_source, tokenizer, uploaded_file=uploaded_file)

    # Chatbot Interaction
    if st.checkbox("Enable Chatbot"):
        user_input = st.text_input("You:", placeholder="Type your message here...")
        if user_input:
            inputs = tokenizer(user_input, return_tensors="pt")
            outputs = model.generate(inputs['input_ids'],
                                     attention_mask=inputs['attention_mask'],
                                     max_length=100,
                                     num_return_sequences=1,
                                     pad_token_id=tokenizer.eos_token_id)
            response = tokenizer.decode(outputs[0], skip_special_tokens=True)
            st.write("Bot:", response)

    # Go Button to Start Training
    if st.button("Go"):
        progress_placeholder = st.empty()
        loading_animation = st.empty()
        st.markdown("### Model Training Progress")
        dashboard = TrainingDashboard()

        def train_progress(loss, generation, individual):
            # training_epochs is captured from the enclosing scope; the
            # dashboard's metrics dict does not hold the epoch count.
            progress = min(generation / training_epochs, 1.0) * 100
""", unsafe_allow_html=True) dashboard.update(loss=loss, generation=generation, individual=individual) thread = threading.Thread(target=train_model, args=(model, train_dataset, tokenizer, training_epochs, batch_size, train_progress)) thread.start() loading_animation.markdown("""
""", unsafe_allow_html=True) thread.join() loading_animation.empty() st.success("Training Complete!") st.write("Training Metrics:") st.write(dashboard.metrics) if __name__ == "__main__": main()