Spaces:

Keeby-smilyai
/

LLM-kitchen

Sleeping

App Files Files Community

Keeby-smilyai committed on Sep 19

Commit

6b9f465

verified ·

1 Parent(s): 479bcae

Update backend.py

Browse files

Files changed (1) hide show

backend.py +67 -98

backend.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# backend.py — FINAL WORKING VERSION
 import sqlite3
 import threading
 import time
@@ -14,38 +14,22 @@ import shutil
 DB_PATH = "llm_kitchen.db"
 training_queue = []
-active_run_lock = threading.Lock()
-active_run_id = None
 RUN_TIMEOUT = 48 * 3600  # 48 hours
 MAX_RAM_PER_RUN_GB = 1.5
-# ------------------------------ DATABASE ------------------------------
 def init_db():
     conn = sqlite3.connect(DB_PATH, check_same_thread=False)
     cursor = conn.cursor()
     cursor.executescript("""
-        CREATE TABLE IF NOT EXISTS users (
-            id INTEGER PRIMARY KEY AUTOINCREMENT,
-            hf_token TEXT UNIQUE NOT NULL,
-            created_at DATETIME DEFAULT CURRENT_TIMESTAMP
-        );
-        CREATE TABLE IF NOT EXISTS training_runs (
-            id INTEGER PRIMARY KEY AUTOINCREMENT,
-            user_id INTEGER NOT NULL,
-            arch_type TEXT NOT NULL,
-            num_layers INTEGER NOT NULL,
-            learning_rate REAL NOT NULL,
-            epochs INTEGER NOT NULL,
-            batch_size INTEGER NOT NULL,
-            status TEXT DEFAULT 'queued',
-            logs TEXT DEFAULT '',
-            started_at DATETIME,
-            completed_at DATETIME,
-            FOREIGN KEY (user_id) REFERENCES users(id)
-        );
     """)
-    conn.commit()
     conn.close()
 init_db()
@@ -69,10 +53,7 @@ def create_user(hf_token):
     return user_id
 def create_training_run(user_id, config):
-    _, run_id = db_query("""
-        INSERT INTO training_runs (user_id, arch_type, num_layers, learning_rate, epochs, batch_size)
-        VALUES (?, ?, ?, ?, ?, ?)
-    """, (user_id, config['arch_type'], config['num_layers'], config['learning_rate'], config['epochs'], config['batch_size']))
     return run_id
 def get_user_runs(user_id):
@@ -98,8 +79,7 @@ def log_update(message, run_id):
     if run_id > 0:
         db_query("UPDATE training_runs SET logs = logs || ? || ? WHERE id = ?", ('\n', full_msg, run_id))
-# ------------------------------ AUTH ------------------------------
 def verify_hf_token(token):
     try:
         whoami(token=token)
@@ -111,7 +91,7 @@ def verify_hf_token(token):
     except Exception as e:
         return None, f"Invalid token. Please try again. ({str(e)})"
-# ------------------------------ TRAINING QUEUE ------------------------------
 def ram_available():
     return (psutil.virtual_memory().available / (1024**3)) >= MAX_RAM_PER_RUN_GB
@@ -122,34 +102,53 @@ def queue_training_run(user_id, config):
     return run_id
 def start_training_if_free():
-    global active_run_id
-    with active_run_lock:
-        if active_run_id is not None or not training_queue:
-            return False
-        if not ram_available():
-            print("MemoryWarning: Not enough RAM to start new run.")
-            return False
-        job = training_queue.pop(0)
-        active_run_id = job["run_id"]
-        update_run_status(active_run_id, "running")
-        log_update("🍳 Starting kitchen process...", active_run_id)
-        thread = threading.Thread(target=run_training_job, args=(job,))
-        thread.start()
-        threading.Timer(RUN_TIMEOUT, kill_run_timeout, args=[active_run_id]).start()
-        return True
-def kill_run_timeout(run_id):
-    global active_run_id
-    with active_run_lock:
-        if active_run_id == run_id:
             log_update(f"Run {run_id}: 💥 48-HOUR TIMEOUT REACHED. Terminating.", run_id)
             update_run_status(run_id, "timeout")
-            active_run_id = None
-            start_training_if_free()
-# ------------------------------ CUSTOM MODELS FROM SCRATCH ------------------------------
 class CNNLanguageModel(nn.Module):
     def __init__(self, vocab_size, embed_dim=128, num_layers=4):
         super().__init__()
@@ -165,7 +164,6 @@ class CNNLanguageModel(nn.Module):
         logits = self.fc(x)
         loss = nn.CrossEntropyLoss()(logits.view(-1, logits.size(-1)), labels.view(-1)) if labels is not None else None
         return {"loss": loss, "logits": logits}
 class RNNLanguageModel(nn.Module):
     def __init__(self, vocab_size, embed_dim=128, hidden_dim=256, num_layers=2):
         super().__init__()
@@ -178,7 +176,6 @@ class RNNLanguageModel(nn.Module):
         logits = self.fc(output)
         loss = nn.CrossEntropyLoss()(logits.view(-1, logits.size(-1)), labels.view(-1)) if labels is not None else None
         return {"loss": loss, "logits": logits}
 class TransformerLanguageModel(nn.Module):
     def __init__(self, vocab_size, embed_dim=128, num_heads=4, num_layers=3):
         super().__init__()
@@ -192,13 +189,10 @@ class TransformerLanguageModel(nn.Module):
         logits = self.fc(x)
         loss = nn.CrossEntropyLoss()(logits.view(-1, logits.size(-1)), labels.view(-1)) if labels is not None else None
         return {"loss": loss, "logits": logits}
 def get_model(arch_type, vocab_size, num_layers):
     models = {"cnn": CNNLanguageModel, "rnn": RNNLanguageModel, "transformer": TransformerLanguageModel}
     if arch_type not in models: raise ValueError(f"Unknown arch: {arch_type}")
     return models[arch_type](vocab_size, num_layers=num_layers)
-# ------------------------------ DATASET ------------------------------
 class TextDataset(Dataset):
     def __init__(self, tokenized_data):
         self.data = tokenized_data["input_ids"]
@@ -207,58 +201,41 @@ class TextDataset(Dataset):
     def __getitem__(self, idx):
         return {"input_ids": torch.tensor(self.data[idx]), "labels": torch.tensor(self.data[idx])}
-# ------------------------------ TRAINING JOB ------------------------------
 def run_training_job(job):
-    global active_run_id
     run_id = job["run_id"]
     try:
         device = "cuda" if torch.cuda.is_available() else "cpu"
         log_update(f"🚀 Device = {device} | RAM available: {psutil.virtual_memory().available / (1024**3):.2f} GB", run_id)
         tokenizer = AutoTokenizer.from_pretrained("gpt2")
         tokenizer.pad_token = tokenizer.eos_token
         tokenizer_save_path = f"./runs/{run_id}/tokenizer"
         os.makedirs(tokenizer_save_path, exist_ok=True)
         tokenizer.save_pretrained(tokenizer_save_path)
         log_update(f"💾 Tokenizer saved to {tokenizer_save_path}", run_id)
         model = get_model(job["arch_type"], len(tokenizer), job["num_layers"]).to(device)
         log_update(f"🧱 Model initialized: {job['arch_type']} x{job['num_layers']} layers", run_id)
         dataset = load_dataset("voidful/reasoning_gemini_300k", split="train[:5000]")
-        # THIS IS THE FIX: Added padding="max_length"
-        tokenized_dataset = dataset.map(
-            lambda ex: tokenizer(
-                [q + " " + a for q, a in zip(ex["message"], ex["answer"])],
-                truncation=True,
-                padding="max_length", # <-- THE FIX IS HERE
-                max_length=128
-            ),
-            batched=True,
-            remove_columns=dataset.column_names
-        )
         train_loader = DataLoader(TextDataset(tokenized_dataset), batch_size=job["batch_size"], shuffle=True)
         optimizer = torch.optim.AdamW(model.parameters(), lr=job["learning_rate"])
         model.train()
         log_update(f"▶️ Starting training for {job['epochs']} epochs...", run_id)
         for epoch in range(job["epochs"]):
             for step, batch in enumerate(train_loader):
                 input_ids = batch["input_ids"].to(device)
                 labels = batch["labels"].to(device)
                 optimizer.zero_grad()
                 outputs = model(input_ids, labels=labels)
                 loss = outputs["loss"]
                 loss.backward()
                 optimizer.step()
                 if step % 50 == 0:
                     log_update(f"Epoch {epoch+1} | Step {step} | Loss: {loss.item():.4f}", run_id)
             log_update(f"✅ Epoch {epoch+1} completed.", run_id)
         model_path = f"./runs/{run_id}"
         os.makedirs(model_path, exist_ok=True)
         torch.save(model.state_dict(), f"{model_path}/pytorch_model.bin")
@@ -273,49 +250,41 @@ def run_training_job(job):
         log_update(success_message, run_id)
         update_run_status(run_id, "completed")
     finally:
-        with active_run_lock:
-            if active_run_id == run_id: active_run_id = None
         start_training_if_free()
-# ------------------------------ INFERENCE ------------------------------
 def run_inference(run_id, prompt):
     model_path = f"./runs/{run_id}/pytorch_model.bin"
     tokenizer_path = f"./runs/{run_id}/tokenizer"
     if not (os.path.exists(model_path) and os.path.exists(tokenizer_path)):
         return "ModelError: Model or tokenizer files not found."
     tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
     rows, _ = db_query("SELECT arch_type, num_layers FROM training_runs WHERE id = ?", (run_id,))
     if not rows: return "ModelError: Run not found in database."
     arch_type, num_layers = rows[0]
     model = get_model(arch_type, len(tokenizer), num_layers)
     model.load_state_dict(torch.load(model_path, map_location="cpu"))
     model.eval()
     inputs = tokenizer(prompt, return_tensors="pt")
     input_ids = inputs.input_ids
     with torch.no_grad():
         outputs = model(input_ids)
         logits = outputs["logits"]
         generated_ids = torch.argmax(logits, dim=-1)
         return f"🧑‍🍳 Model says:\n{tokenizer.decode(generated_ids[0], skip_special_tokens=True)}"
-# ------------------------------ PUBLISH TO HUB ------------------------------
 def publish_run_to_hub(run_id, hf_token, repo_name, user_description=""):
     local_dir = f"./runs/{run_id}/hub_upload"
     shutil.rmtree(local_dir, ignore_errors=True)
     os.makedirs(local_dir, exist_ok=True)
     shutil.copy(f"./runs/{run_id}/pytorch_model.bin", f"{local_dir}/pytorch_model.bin")
     shutil.copytree(f"./runs/{run_id}/tokenizer", f"{local_dir}/tokenizer", dirs_exist_ok=True)
     readme_content = user_description.strip() or f"# Model from LLM Kitchen - Run #{run_id}"
     with open(f"{local_dir}/README.md", "w") as f: f.write(readme_content)
     api = HfApi()
     repo_url = api.create_repo(repo_id=repo_name, token=hf_token, exist_ok=True).repo_id
     api.upload_folder(folder_path=local_dir, repo_id=repo_url, token=hf_token)

+# backend.py — PARALLEL PROCESSING VERSION
 import sqlite3
 import threading
 import time
 DB_PATH = "llm_kitchen.db"
 training_queue = []
+# --- NEW STATE MANAGEMENT FOR PARALLELISM ---
+active_runs = set()   # Stores run_ids of currently running jobs
+active_users = set()  # Stores user_ids of users with a currently running job
+scheduler_lock = threading.Lock() # Protects access to the queue and active sets
+# --- CONSTANTS ---
 RUN_TIMEOUT = 48 * 3600  # 48 hours
 MAX_RAM_PER_RUN_GB = 1.5
+# ------------------------------ DATABASE (No Changes Needed) ------------------------------
 def init_db():
     """Create the `users` and `training_runs` tables if they do not exist.

     Opens a connection to DB_PATH, runs the schema script, commits, and
     closes the connection even when the script raises.
     """
     conn = sqlite3.connect(DB_PATH, check_same_thread=False)
     try:
         conn.executescript("""
         CREATE TABLE IF NOT EXISTS users (id INTEGER PRIMARY KEY AUTOINCREMENT, hf_token TEXT UNIQUE NOT NULL, created_at DATETIME DEFAULT CURRENT_TIMESTAMP);
         CREATE TABLE IF NOT EXISTS training_runs (id INTEGER PRIMARY KEY AUTOINCREMENT, user_id INTEGER NOT NULL, arch_type TEXT NOT NULL, num_layers INTEGER NOT NULL, learning_rate REAL NOT NULL, epochs INTEGER NOT NULL, batch_size INTEGER NOT NULL, status TEXT DEFAULT 'queued', logs TEXT DEFAULT '', started_at DATETIME, completed_at DATETIME, FOREIGN KEY (user_id) REFERENCES users(id));
     """)
         # Commit explicitly so the schema persists even when the connection
         # runs in a transaction mode that does not auto-commit DDL.
         conn.commit()
     finally:
         # Release the connection on the error path too.
         conn.close()
 init_db()
     return user_id
 def create_training_run(user_id, config):
     """Insert a training run row for `user_id` built from `config`.

     `config` must provide the keys arch_type, num_layers, learning_rate,
     epochs and batch_size. Returns the second element of db_query's result
     (presumably the id of the inserted row; verify against db_query).
     """
     insert_sql = "INSERT INTO training_runs (user_id, arch_type, num_layers, learning_rate, epochs, batch_size) VALUES (?, ?, ?, ?, ?, ?)"
     # Column order of the INSERT after user_id; looked up in this order.
     config_keys = ('arch_type', 'num_layers', 'learning_rate', 'epochs', 'batch_size')
     params = (user_id, *(config[key] for key in config_keys))
     _unused, new_run_id = db_query(insert_sql, params)
     return new_run_id
 def get_user_runs(user_id):
     if run_id > 0:
         db_query("UPDATE training_runs SET logs = logs || ? || ? WHERE id = ?", ('\n', full_msg, run_id))
+# ------------------------------ AUTH (No Changes Needed) ------------------------------
 def verify_hf_token(token):
     try:
         whoami(token=token)
     except Exception as e:
         return None, f"Invalid token. Please try again. ({str(e)})"
+# ------------------------------ NEW PARALLEL TRAINING QUEUE ------------------------------
 def ram_available():
     """Return True when available system RAM covers MAX_RAM_PER_RUN_GB."""
     bytes_per_gib = 1024**3
     free_gib = psutil.virtual_memory().available / bytes_per_gib
     return free_gib >= MAX_RAM_PER_RUN_GB
     return run_id
 def start_training_if_free():
     """Start every queued job that the current constraints allow.

     Walks the queue in order while holding `scheduler_lock`. A job is started
     when enough RAM is free and its user has no other active run. Jobs whose
     user is busy stay queued and the next job is considered; scheduling stops
     for this pass as soon as RAM is insufficient.
     """
     with scheduler_lock:
         # Threads launched in this pass have not allocated their memory yet,
         # so psutil still reports that RAM as free. Reserve one per-run budget
         # for each of them; otherwise a single pass would start every eligible
         # job on the strength of the same free memory.
         launched_this_pass = 0
         # Iterate over a copy because started jobs are removed from the queue.
         for job in list(training_queue):
             # 1. Global resource constraint (RAM), including reservations.
             free_gb = psutil.virtual_memory().available / (1024**3)
             needed_gb = MAX_RAM_PER_RUN_GB * (launched_this_pass + 1)
             if free_gb < needed_gb:
                 # run_id -1: scheduler message, not tied to a specific run.
                 log_update("MemoryWarning: Not enough RAM for new runs. Waiting.", -1)
                 break
             # 2. Per-user constraint: at most one active run per user.
             if job["user_id"] in active_users:
                 continue
             log_update(f"Scheduler: Starting run #{job['run_id']} for user #{job['user_id']}", -1)
             # Record the job as active before its thread exists so no other
             # scheduling pass can pick it (or its user) again.
             active_runs.add(job["run_id"])
             active_users.add(job["user_id"])
             training_queue.remove(job)
             launched_this_pass += 1
             update_run_status(job["run_id"], "running")
             log_update("🍳 Starting kitchen process...", job["run_id"])
             threading.Thread(target=run_training_job, args=(job,)).start()
             # Daemon timer: it is never cancelled, so a non-daemon timer would
             # keep the interpreter alive for RUN_TIMEOUT after the job ended.
             timeout_timer = threading.Timer(RUN_TIMEOUT, kill_run_timeout, args=[job])
             timeout_timer.daemon = True
             timeout_timer.start()
 def kill_run_timeout(job):
     """Timer callback: mark a run as timed out and release its scheduler slot.

     Does nothing to the run when it is no longer in `active_runs`. The
     scheduler is re-run afterwards in every case.

     NOTE(review): this only updates the status and the bookkeeping sets;
     nothing visible here stops the training thread itself. Confirm whether
     the job is interrupted elsewhere.
     """
     run_id = job["run_id"]
     user_id = job["user_id"]
     with scheduler_lock:
         if run_id in active_runs:
             log_update(f"Run {run_id}: 💥 48-HOUR TIMEOUT REACHED. Terminating.", run_id)
             update_run_status(run_id, "timeout")
             # Free the run's slot and let the user queue again.
             active_runs.discard(run_id)
             active_users.discard(user_id)
     # Outside the lock on purpose: start_training_if_free acquires
     # scheduler_lock itself and the lock is not re-entrant.
     start_training_if_free()
+# ------------------------------ MODELS & DATASET (No Changes Needed) -------------------------
+# ... (All model and dataset classes are unchanged) ...
 class CNNLanguageModel(nn.Module):
     def __init__(self, vocab_size, embed_dim=128, num_layers=4):
         super().__init__()
         logits = self.fc(x)
         loss = nn.CrossEntropyLoss()(logits.view(-1, logits.size(-1)), labels.view(-1)) if labels is not None else None
         return {"loss": loss, "logits": logits}
 class RNNLanguageModel(nn.Module):
     def __init__(self, vocab_size, embed_dim=128, hidden_dim=256, num_layers=2):
         super().__init__()
         logits = self.fc(output)
         loss = nn.CrossEntropyLoss()(logits.view(-1, logits.size(-1)), labels.view(-1)) if labels is not None else None
         return {"loss": loss, "logits": logits}
 class TransformerLanguageModel(nn.Module):
     def __init__(self, vocab_size, embed_dim=128, num_heads=4, num_layers=3):
         super().__init__()
         logits = self.fc(x)
         loss = nn.CrossEntropyLoss()(logits.view(-1, logits.size(-1)), labels.view(-1)) if labels is not None else None
         return {"loss": loss, "logits": logits}
 def get_model(arch_type, vocab_size, num_layers):
     """Instantiate the language model class registered for `arch_type`.

     Supported names are "cnn", "rnn" and "transformer"; any other value
     raises ValueError.
     """
     registry = {
         "cnn": CNNLanguageModel,
         "rnn": RNNLanguageModel,
         "transformer": TransformerLanguageModel,
     }
     if arch_type in registry:
         model_cls = registry[arch_type]
         return model_cls(vocab_size, num_layers=num_layers)
     raise ValueError(f"Unknown arch: {arch_type}")
 class TextDataset(Dataset):
     def __init__(self, tokenized_data):
         self.data = tokenized_data["input_ids"]
     def __getitem__(self, idx):
         return {"input_ids": torch.tensor(self.data[idx]), "labels": torch.tensor(self.data[idx])}
+# ------------------------------ TRAINING JOB (Updated `finally` block) -----------------------
 def run_training_job(job):
     run_id = job["run_id"]
+    user_id = job["user_id"] # Get user_id for state management
     try:
         device = "cuda" if torch.cuda.is_available() else "cpu"
         log_update(f"🚀 Device = {device} | RAM available: {psutil.virtual_memory().available / (1024**3):.2f} GB", run_id)
+        # (The core training logic remains the same)
         tokenizer = AutoTokenizer.from_pretrained("gpt2")
         tokenizer.pad_token = tokenizer.eos_token
         tokenizer_save_path = f"./runs/{run_id}/tokenizer"
         os.makedirs(tokenizer_save_path, exist_ok=True)
         tokenizer.save_pretrained(tokenizer_save_path)
         log_update(f"💾 Tokenizer saved to {tokenizer_save_path}", run_id)
         model = get_model(job["arch_type"], len(tokenizer), job["num_layers"]).to(device)
         log_update(f"🧱 Model initialized: {job['arch_type']} x{job['num_layers']} layers", run_id)
         dataset = load_dataset("voidful/reasoning_gemini_300k", split="train[:5000]")
+        tokenized_dataset = dataset.map(lambda ex: tokenizer([q + " " + a for q, a in zip(ex["message"], ex["answer"])], truncation=True, padding="max_length", max_length=128), batched=True, remove_columns=dataset.column_names)
         train_loader = DataLoader(TextDataset(tokenized_dataset), batch_size=job["batch_size"], shuffle=True)
         optimizer = torch.optim.AdamW(model.parameters(), lr=job["learning_rate"])
         model.train()
         log_update(f"▶️ Starting training for {job['epochs']} epochs...", run_id)
         for epoch in range(job["epochs"]):
             for step, batch in enumerate(train_loader):
                 input_ids = batch["input_ids"].to(device)
                 labels = batch["labels"].to(device)
                 optimizer.zero_grad()
                 outputs = model(input_ids, labels=labels)
                 loss = outputs["loss"]
                 loss.backward()
                 optimizer.step()
                 if step % 50 == 0:
                     log_update(f"Epoch {epoch+1} | Step {step} | Loss: {loss.item():.4f}", run_id)
             log_update(f"✅ Epoch {epoch+1} completed.", run_id)
         model_path = f"./runs/{run_id}"
         os.makedirs(model_path, exist_ok=True)
         torch.save(model.state_dict(), f"{model_path}/pytorch_model.bin")
         log_update(success_message, run_id)
         update_run_status(run_id, "completed")
     finally:
+        # --- NEW: Free up resources and trigger scheduler ---
+        with scheduler_lock:
+            active_runs.discard(run_id)
+            active_users.discard(user_id)
         start_training_if_free()
+# ------------------------------ INFERENCE & PUBLISH (No Changes Needed) --------------------
+# ... (run_inference and publish_run_to_hub are unchanged) ...
 def run_inference(run_id, prompt):
     """Run one forward pass of a trained run's model on `prompt`.

     Loads the tokenizer and weights saved under ./runs/<run_id>/, rebuilds
     the architecture recorded for the run in `training_runs`, and decodes the
     arg-max token at every input position (a single forward pass, not an
     autoregressive loop). Returns the decoded text behind a banner line, or a
     "ModelError: ..." string when the files or the database row are missing.
     """
     weights_file = f"./runs/{run_id}/pytorch_model.bin"
     tokenizer_dir = f"./runs/{run_id}/tokenizer"
     if not os.path.exists(weights_file) or not os.path.exists(tokenizer_dir):
         return "ModelError: Model or tokenizer files not found."
     tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
     rows, _ = db_query("SELECT arch_type, num_layers FROM training_runs WHERE id = ?", (run_id,))
     if not rows:
         return "ModelError: Run not found in database."
     arch_type, num_layers = rows[0]
     # Vocabulary size comes from the saved tokenizer.
     model = get_model(arch_type, len(tokenizer), num_layers)
     state_dict = torch.load(weights_file, map_location="cpu")
     model.load_state_dict(state_dict)
     model.eval()
     prompt_ids = tokenizer(prompt, return_tensors="pt").input_ids
     with torch.no_grad():
         logits = model(prompt_ids)["logits"]
     predicted_ids = torch.argmax(logits, dim=-1)
     decoded = tokenizer.decode(predicted_ids[0], skip_special_tokens=True)
     return f"🧑‍🍳 Model says:\n{decoded}"
 def publish_run_to_hub(run_id, hf_token, repo_name, user_description=""):
     """Stage a run's artifacts and upload them to a Hugging Face Hub repo.

     Rebuilds ./runs/<run_id>/hub_upload from scratch, copies the model
     weights and tokenizer directory into it, writes a README.md, creates the
     repo if needed and uploads the staged folder.

     Args:
         run_id: Run whose files under ./runs/<run_id>/ are published.
         hf_token: Token passed to the Hub API calls.
         repo_name: Repo id handed to `create_repo`.
         user_description: README text; when blank after stripping, a default
             title naming the run is written instead.

     NOTE(review): no return statement is visible in this view of the file;
     confirm what callers expect back.
     """
     local_dir = f"./runs/{run_id}/hub_upload"
     # Start from an empty staging directory so stale files are never uploaded.
     shutil.rmtree(local_dir, ignore_errors=True)
     os.makedirs(local_dir, exist_ok=True)
     shutil.copy(f"./runs/{run_id}/pytorch_model.bin", f"{local_dir}/pytorch_model.bin")
     shutil.copytree(f"./runs/{run_id}/tokenizer", f"{local_dir}/tokenizer", dirs_exist_ok=True)
     readme_content = user_description.strip() or f"# Model from LLM Kitchen - Run #{run_id}"
     # User text may contain non-ASCII characters; write UTF-8 explicitly
     # instead of relying on the platform's default encoding.
     with open(f"{local_dir}/README.md", "w", encoding="utf-8") as f:
         f.write(readme_content)
     api = HfApi()
     # create_repo returns the resolved repo id, which is what gets uploaded to.
     repo_url = api.create_repo(repo_id=repo_name, token=hf_token, exist_ok=True).repo_id
     api.upload_folder(folder_path=local_dir, repo_id=repo_url, token=hf_token)