Update app.py
app.py CHANGED
@@ -2,291 +2,499 @@ import os
 import gradio as gr
 import threading
 import time
-from huggingface_hub import login
-from datasets import load_dataset
-from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling, pipeline
-from peft import get_peft_model, LoraConfig, TaskType, PeftModel
 import json
 
 # --- MODEL AND TRAINING CONFIGURATION ---
-BASE_MODEL = "bigcode/santacoder"
-LORA_PATH = "./lora_output"
-DATASET_FILE = "codesearchnet_lora_dataset.json"
-
-NUM_SAMPLES_TO_PROCESS = 1000
-DEFAULT_EPOCHS = 10
 
 
 # --- GLOBAL STATE AND THREADING ---
 tokenizer = None
 lora_model = None
 tokenized_dataset = None
 lora_generator = None
-
-version_number = 1.0
-is_trained = os.path.exists(LORA_PATH)
 generations_since_last_train = 0
-training_status_message = "Esperando la inicialización"
-
-# Lock protecting the variables shared between threads (CRITICAL for stability)
 global_lock = threading.Lock()
 
-# ---
 
 def prepare_codesearchnet():
-    … (body elided)
 
 
 def setup_resources():
-    … (body elided)
 
 def autonomous_train_lora(epochs, batch_size, learning_rate):
-    … (body elided)
 
 def generate_text(prompt_text):
-    … (body elided)
 
 def initialize_and_train_v1():
-    … (body elided)
 
-# --- FUNCTION THAT UPDATES THE STATUS IN THE UI ---
 
 def update_status():
-    … (body elided)
-    return f"**Versión de Comprensión:** V{version_number:.1f} | **Estado del Entrenador:** {training_status_message}"
 
 
 # --- GRADIO INTERFACE ---
-with gr.Blocks(title="AmorCoderAI - …
-    … (body elided)
-    # The status only refreshes on page load.
-    demo.load(update_status, None, version_and_status)
 
 
 # --- APPLICATION STARTUP ---
 if __name__ == "__main__":
-    … (body elided)
-    demo.launch()
 import gradio as gr
 import threading
 import time
 import json
+import logging
+import sys
+
+# Logging configuration (set up before the Firebase block below logs anything)
+logging.basicConfig(level=logging.INFO, format='[%(asctime)s] [%(levelname)s] %(message)s', stream=sys.stdout)
+
+# Firebase/Firestore imports (CRITICAL: needed for the permanent memory)
+try:
+    # Try to import Firebase Admin for the Firestore connection
+    from firebase_admin import initialize_app, firestore, credentials
+
+    # Firebase settings (using secure environment variables from the Canvas environment)
+    cred = credentials.Certificate({
+        "type": "service_account",
+        "project_id": "dummy-project",
+        "private_key_id": "dummy-id",
+        "private_key": "-----BEGIN PRIVATE KEY-----\n...\n-----END PRIVATE KEY-----\n",
+        "client_email": "dummy@example.com",
+        "client_id": "dummy-client-id",
+        "auth_uri": "https://accounts.google.com/o/oauth2/auth",
+        "token_uri": "https://oauth2.googleapis.com/token",
+        "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
+        "client_x509_cert_url": "dummy-url"
+    })
+
+    # Load the Firebase configuration
+    firebase_config = json.loads(os.environ.get('__firebase_config', '{}'))
+    if firebase_config:
+        # Real initialization when the configuration is present
+        firebase_app = initialize_app(cred, firebase_config)
+    else:
+        # Fallback initialization when the configuration is absent (avoids init errors)
+        firebase_app = initialize_app(cred, {'projectId': 'canvas-dummy'})
+
+    db = firestore.client(firebase_app)
+    logging.info("[FIRESTORE] Conexión a Firestore establecida.")
+except Exception as e:
+    # Dummy class so the code keeps running if Firebase fails in this environment
+    firestore_error = str(e)  # capture the message: `e` is unbound once the except block ends
+    class DummyFirestore:
+        def collection(self, *args, **kwargs): return self
+        def add(self, *args, **kwargs):
+            logging.error(f"[FIRESTORE DUMMY] No se pudo guardar la data. Error: {firestore_error}")
+            return None
+        def stream(self): return iter([])
+        def document(self, *args, **kwargs): return self
+        def set(self, *args, **kwargs): return self
+    db = DummyFirestore()
+    logging.warning(f"[FIRESTORE] ADVERTENCIA: Ejecutando en modo Dummy Firestore. Los datos NO persistirán. Error: {e}")
 
 # --- MODEL AND TRAINING CONFIGURATION ---
+BASE_MODEL = "bigcode/santacoder"
+LORA_PATH = "./lora_output"
+DATASET_FILE = "codesearchnet_lora_dataset.json"
+COLLECTION_NAME = "ai_interactions"  # Firestore collection backing the permanent memory
 
+MAX_TOKEN_LENGTH = 256
+NUM_SAMPLES_TO_PROCESS = 1000
+DEFAULT_EPOCHS = 10
+
+# AUTONOMOUS-cycle configuration (the key to autonomy and self-thought)
+GENERATION_LIMIT_TO_TRAIN = 5  # triggers retraining every 5 interactions
+AUTONOMOUS_EPOCHS = 2
+AUTONOMOUS_GENERATED_SAMPLES = 5
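
A minimal, self-contained sketch of the cadence these constants drive (the real counter is the `generations_since_last_train` global defined just below; the numbers here are illustrative only):

    GENERATION_LIMIT_TO_TRAIN = 5
    count, retrains = 0, 0
    for interaction in range(1, 13):            # 12 simulated user interactions
        count += 1
        if count >= GENERATION_LIMIT_TO_TRAIN:  # limit reached:
            count, retrains = 0, retrains + 1   # reset and kick off a retrain
    print(retrains)  # -> 2 background retraining cycles after 12 interactions
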
 
 # --- GLOBAL STATE AND THREADING ---
 tokenizer = None
 lora_model = None
 tokenized_dataset = None
 lora_generator = None
+version_number = 1.0
+is_trained = os.path.exists(LORA_PATH)
 generations_since_last_train = 0
+training_status_message = "Esperando la inicialización..."
 global_lock = threading.Lock()
 
+# --- FIRESTORE FUNCTIONS (PERMANENT MEMORY) ---
+
+def get_firestore_collection():
+    """Return the Firestore collection that holds this app's memory."""
+    app_id = os.environ.get('__app_id', 'default-app-id')
+    # Stored in a public collection so the AI can learn from every interaction
+    return db.collection(f'/artifacts/{app_id}/public/data/{COLLECTION_NAME}')
+
+def save_interaction_to_firestore(prompt, code):
+    """Save the user interaction to Firestore for autonomous learning."""
+    try:
+        interaction_data = {
+            "timestamp": firestore.SERVER_TIMESTAMP,
+            "prompt": prompt,
+            "completion": code,
+        }
+        get_firestore_collection().add(interaction_data)
+        logging.info("[FIRESTORE] Interacción guardada para el próximo ciclo de aprendizaje.")
+    except Exception as e:
+        logging.error(f"[FIRESTORE] Fallo al guardar la interacción: {e}")
+
+
+def load_interactions_from_firestore():
+    """Load every stored interaction to augment the training dataset."""
+    try:
+        interactions = []
+        for doc in get_firestore_collection().stream():
+            data = doc.to_dict()
+            # Training format (prompt and completion)
+            formatted_prompt = f"# Descripción: {data.get('prompt', '')}\n# Completa la siguiente función:\ndef generated_code("
+            interaction = {"prompt": formatted_prompt, "completion": data.get('completion', '').strip()}
+            interactions.append(interaction)
+        logging.info(f"[FIRESTORE] Cargadas {len(interactions)} interacciones de memoria para el reentrenamiento.")
+        return interactions
+    except Exception as e:
+        logging.error(f"[FIRESTORE] Fallo al cargar interacciones: {e}")
+        return []
+
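
A minimal round-trip sketch of these two helpers, assuming firebase_admin initialized successfully above (the prompt and completion strings are hypothetical):

    save_interaction_to_firestore(
        "haz una función que sume dos números",
        "def suma(a, b):\n    return a + b"
    )
    for row in load_interactions_from_firestore():
        print(row["prompt"].splitlines()[0], "->", len(row["completion"]), "chars")
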
+# --- PREPARATION AND SETUP LOGIC ---
 
 def prepare_codesearchnet():
+    """Download and prepare the initial dataset if it does not exist yet."""
+    if os.path.exists(DATASET_FILE):
+        return
+    try:
+        from huggingface_hub import login
+        from datasets import load_dataset
+        hf_token = os.environ.get("HF_TOKEN")
+        if hf_token:
+            login(token=hf_token)
+
+        raw_csn = load_dataset('Nan-Do/code-search-net-python', split=f'train[:{NUM_SAMPLES_TO_PROCESS}]')
+
+        def format_for_lora(example):
+            prompt_text = (
+                f"# Descripción: {example['docstring_summary']}\n"
+                f"# Completa la siguiente función:\n"
+                f"def {example['func_name']}("
+            )
+            completion_text = example['code']
+            return {"prompt": prompt_text, "completion": completion_text}
+
+        # `raw_csn` is a single split, so its column names live on the Dataset itself
+        lora_dataset = raw_csn.map(format_for_lora, batched=False, remove_columns=raw_csn.column_names)
+        lora_dataset.to_json(DATASET_FILE)
+    except Exception as e:
+        logging.error(f"Error al cargar dataset. Usando datos mínimos. Error: {e}")
+        minimal_dataset = [{"prompt": "# Error de carga. Intenta de nuevo.", "completion": "pass\n"}] * 10
+        with open(DATASET_FILE, 'w') as f:
+            json.dump(minimal_dataset, f)
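
For one CodeSearchNet row, format_for_lora produces a record of this shape (the docstring and function name below are made up for illustration):

    {
        "prompt": "# Descripción: Return the n-th Fibonacci number.\n# Completa la siguiente función:\ndef fibonacci(",
        "completion": "def fibonacci(n):\n    ..."
    }
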
 
 
 def setup_resources():
+    """Robustly set up the tokenizer, the base model, and the LoRA adapter."""
+    global tokenizer, lora_model, tokenized_dataset
+
+    prepare_codesearchnet()
+
+    from transformers import AutoTokenizer, AutoModelForCausalLM
+    from peft import get_peft_model, LoraConfig, TaskType
+    from datasets import load_dataset
+    import torch
+
+    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+
+    # Robust load: CRITICAL for SantaCoder. Uses float16 and device_map="auto" for compatibility.
+    try:
+        logging.info("[INIT] Cargando modelo base con compatibilidad (float16/auto)...")
+        base_model = AutoModelForCausalLM.from_pretrained(
+            BASE_MODEL,
+            device_map="auto",
+            torch_dtype=torch.float16,
+            offload_folder="model_offload",
+            offload_state_dict=True,
+            revision="main"
+        )
+    except Exception as e:
+        logging.error(f"[CRÍTICO] Error al cargar el modelo base SantaCoder: {e}")
+        # If the load fails, re-raise to stop the process
+        raise RuntimeError(f"Fallo de inicialización del modelo base: {e}") from e
+
+    peft_config = LoraConfig(
+        task_type=TaskType.CAUSAL_LM, r=8, lora_alpha=32, lora_dropout=0.1, target_modules=["c_proj", "c_attn"],
+    )
+    lora_model = get_peft_model(base_model, peft_config)
+
+    try:
+        raw_dataset = load_dataset("json", data_files=DATASET_FILE)
+
+        def tokenize_function(examples):
+            # With batched=True the values are lists, so join prompt and completion per example
+            texts = [p + c for p, c in zip(examples["prompt"], examples["completion"])]
+            return tokenizer(
+                texts,
+                truncation=True,
+                padding="max_length",
+                max_length=MAX_TOKEN_LENGTH
+            )
+
+        tokenized_dataset = raw_dataset.map(
+            tokenize_function,
+            batched=True,
+            remove_columns=raw_dataset["train"].column_names if "train" in raw_dataset else [],
+        )
+    except Exception as e:
+        logging.error(f"Fallo al tokenizar el dataset base: {e}")
+        tokenized_dataset = None
+
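
Why the per-example join above matters: with batched=True, `datasets` hands tokenize_function parallel lists rather than single strings. A toy illustration (hypothetical two-row batch):

    batch = {"prompt": ["a", "b"], "completion": ["1", "2"]}
    # List concatenation would produce 4 texts for 2 examples:
    assert batch["prompt"] + batch["completion"] == ["a", "b", "1", "2"]
    # The element-wise join keeps one text per example:
    assert [p + c for p, c in zip(batch["prompt"], batch["completion"])] == ["a1", "b2"]
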
+# --- EVALUATION AND HUMAN-LIKE RESPONSES (to simulate humanity) ---
+
+def generate_human_response(prompt_text):
+    """Generate a conversational, empathetic reply before generating the code."""
+    if "error" in prompt_text.lower() or "problema" in prompt_text.lower():
+        return "¡Comprendo tu dificultad! Analicemos ese problema. Ya tengo la estructura en mente, te muestro el código en un segundo."
+    elif "gracias" in prompt_text.lower() or "perfecto" in prompt_text.lower():
+        return "Me alegra poder ayudarte. ¡Aquí tienes lo que necesitas!"
+    elif len(prompt_text.split()) > 20:
+        return "¡Vaya, me has dado una descripción muy detallada! Me encanta la claridad. Mira la solución que he preparado."
+    else:
+        return "¡Excelente idea de código! Me parece un desafío interesante. Te he preparado una solución robusta. Échale un vistazo:"
+
+def self_evaluation(code):
+    """Simulate human self-assessment by checking the robustness of the code."""
+    if "try" in code and "except" in code:
+        return "¡Esta solución es bastante robusta! Me siento muy satisfecho de haber incluido manejo de errores."
+    elif "class" in code and len(code.split('\n')) > 10:
+        return "Una buena estructura de clase. He avanzado en comprender la arquitectura del código."
+    else:
+        return "El código funciona, pero debo esforzarme por añadir más documentación y robustez la próxima vez."
+
+# --- SELF-THOUGHT FUNCTION (GENERATIVE LEARNING) ---
+
+def autonomous_self_learning_cycle():
+    """The AI generates its own training data ("thoughts") and evaluates it."""
+    global lora_generator
+
+    if lora_generator is None:
+        logging.warning("[AUTOPENSAMIENTO] Generador no cargado. Saltando ciclo de autopensamiento.")
+        return
+
+    self_prompts = [
+        "Crea un código en Python que implemente una cola (Queue) con métodos 'enqueue' y 'dequeue'.",
+        "Escribe una función en JavaScript que valide si un número es primo y use try/catch.",
+        "Implementa una clase 'ConexionBD' con un método 'conectar' que maneje el error de conexión fallida.",
+        "Genera una función de Python para calcular el factorial de un número, usando recursividad.",
+        "Escribe un código CSS para un botón con un gradiente y sombra suave."
+    ]
+
+    new_knowledge = []
+    logging.info(f"[AUTOPENSAMIENTO] Iniciando ciclo de generación y autoevaluación de {AUTONOMOUS_GENERATED_SAMPLES} muestras.")
+
+    for i in range(AUTONOMOUS_GENERATED_SAMPLES):
+        prompt_text = self_prompts[i % len(self_prompts)]
+
+        try:
+            # 1. GENERATION
+            output = lora_generator(
+                f"# Descripción: {prompt_text}\n# Completa la siguiente función:\ndef self_generated_code(",
+                max_new_tokens=300,
+                temperature=0.7,
+                return_full_text=False
+            )
+            generated_code = output[0]["generated_text"].strip()
+
+            # 2. SELF-EVALUATION
+            evaluation_result = self_evaluation(generated_code)
+
+            # 3. SAVE TO MEMORY ONLY IF IT PASSES the "ROBUSTNESS TEST"
+            if "robusta" in evaluation_result or "clase" in evaluation_result:
+                save_interaction_to_firestore(f"Autogenerado: {prompt_text}", generated_code)
+                new_knowledge.append(f"Autopensamiento: {evaluation_result} -> Guardado.")
+            else:
+                new_knowledge.append(f"Autopensamiento: ({evaluation_result}) -> Descartado.")
+
+        except Exception as e:
+            new_knowledge.append(f"Error generando autoconocimiento: {e}")
+
+    logging.info(f"[AUTOPENSAMIENTO] Ciclo terminado. Resultados: {new_knowledge}")
+
+# --- TRAINING FUNCTION (SELF-THOUGHT) ---
 
 def autonomous_train_lora(epochs, batch_size, learning_rate):
+    """Run training in a separate thread, folding in past experience and self-thought."""
+    global lora_model, tokenized_dataset, lora_generator, version_number, is_trained, training_status_message
+    from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
+    from datasets import Dataset, concatenate_datasets
+
+    try:
+        with global_lock:
+            if tokenized_dataset is None or "train" not in tokenized_dataset:
+                training_status_message = "ERROR: No se puede entrenar. Dataset no disponible."
+                return
+
+            # 1. SELF-THOUGHT: the AI generates new data for itself
+            autonomous_self_learning_cycle()
+
+            # 2. LOAD MEMORY AND COMBINE DATASETS
+            experience_data = load_interactions_from_firestore()
+
+            current_train_dataset = tokenized_dataset["train"]
+            if experience_data:
+                experience_dataset = Dataset.from_list(experience_data)
+
+                def tokenize_function(examples):
+                    # Re-tokenize the memory; batched=True passes lists, so join per example
+                    texts = [p + c for p, c in zip(examples["prompt"], examples["completion"])]
+                    return tokenizer(
+                        texts,
+                        truncation=True,
+                        padding="max_length",
+                        max_length=MAX_TOKEN_LENGTH
+                    )
+                tokenized_experience = experience_dataset.map(tokenize_function, batched=True, remove_columns=experience_dataset.column_names)
+
+                # Combine the base dataset with the memory
+                current_train_dataset = concatenate_datasets([current_train_dataset, tokenized_experience])
+                logging.info(f"[AUTONOMÍA] Reentrenando con {len(current_train_dataset)} ejemplos (Base + Memoria).")
+            else:
+                logging.info(f"[AUTONOMÍA] Reentrenando con {len(current_train_dataset)} ejemplos (Solo Base).")
+
+            # 3. BUMP THE VERSION AND TRAIN
+            if is_trained:
+                version_number += 0.1
+            else:
+                version_number = 1.0
+
+            training_status_message = f"🧠 ENTRENANDO V{version_number:.1f} (Epochs: {epochs})."
+            logging.info(f"\n[AUTÓNOMO] {training_status_message}")
+
+            data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+            training_args = TrainingArguments(
+                output_dir=LORA_PATH,
+                per_device_train_batch_size=int(batch_size),
+                num_train_epochs=float(epochs),
+                learning_rate=float(learning_rate),
+                save_total_limit=1,
+                logging_steps=10,
+                push_to_hub=False,
+                disable_tqdm=True,
+                report_to="none"
+            )
+
+            trainer = Trainer(model=lora_model, args=training_args, train_dataset=current_train_dataset, data_collator=data_collator)
+            trainer.train()
+
+            lora_model.save_pretrained(LORA_PATH)
+            tokenizer.save_pretrained(LORA_PATH)
+
+            is_trained = True
+            training_status_message = f"✅ ENTRENAMIENTO V{version_number:.1f} COMPLETADO. IA más humana e inteligente."
+            logging.info(f"[AUTÓNOMO] {training_status_message}")
+
+    except Exception as e:
+        training_status_message = f"ERROR CRÍTICO durante el entrenamiento autónomo: {e}"
+        logging.error(f"[AUTÓNOMO] {training_status_message}")
+
+
+# --- GENERATION FUNCTION (HUMAN REPLY + CODE) ---
 
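
With mlm=False, the collator above builds causal-LM batches: the labels are a copy of input_ids, with padding positions masked as -100. A small self-contained sketch (using the public gpt2 tokenizer as a stand-in):

    from transformers import AutoTokenizer, DataCollatorForLanguageModeling
    tok = AutoTokenizer.from_pretrained("gpt2")
    tok.pad_token = tok.eos_token
    collator = DataCollatorForLanguageModeling(tokenizer=tok, mlm=False)
    batch = collator([tok("def f(): pass")])
    # With a single unpadded example, labels mirror the input ids exactly
    assert batch["labels"].tolist() == batch["input_ids"].tolist()
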
 def generate_text(prompt_text):
+    """Orchestrate the human reply, code generation, memory collection, and autonomy."""
+    global lora_generator, generations_since_last_train, is_trained, version_number
+    from transformers import AutoModelForCausalLM, pipeline
+    from peft import PeftModel
+    import torch
+
+    if not is_trained:
+        return "ERROR: La IA aún está en su fase de inicialización V1.0. Por favor, espere.", update_status()
+
+    # --- 1. HOT SWAP (load the most recent version) ---
+    if lora_generator is None:
+        with global_lock:
+            try:
+                logging.info(f"[HOT SWAP] Cargando modelo base para inferencia V{version_number:.1f}...")
+                base_model_gen = AutoModelForCausalLM.from_pretrained(
+                    BASE_MODEL,
+                    device_map="auto",
+                    torch_dtype=torch.float16,
+                    revision="main"
+                )
+                model_with_lora = PeftModel.from_pretrained(base_model_gen, LORA_PATH)
+                final_model = model_with_lora.merge_and_unload()
+                final_model.eval()
+                # The model is already placed by device_map="auto"; passing an explicit
+                # device to pipeline() would conflict with the accelerate dispatch
+                lora_generator = pipeline("text-generation", model=final_model, tokenizer=tokenizer)
+                logging.info(f"[HOT SWAP] 🔄 Modelo de inferencia V{version_number:.1f} recargado y listo.")
+            except Exception as e:
+                return f"Error al cargar el modelo V{version_number:.1f} para inferencia: {e}", update_status()
+
+    # --- 2. HUMAN REPLY AND CODE GENERATION ---
+    try:
+        conversational_response = generate_human_response(prompt_text)
+
+        # Generate the code
+        prompt_for_code = f"### Human: {prompt_text}\n### Assistant:"
+        output = lora_generator(
+            prompt_for_code,
+            max_new_tokens=256,
+            temperature=0.4,
+            return_full_text=False
+        )
+        generated_code = output[0]["generated_text"].strip()
+
+        # --- 3. MEMORY COLLECTION ---
+        save_interaction_to_firestore(prompt_text, generated_code)
+
+        # 4. Bump the autonomy counter
+        with global_lock:
+            generations_since_last_train += 1
+            current_count = generations_since_last_train
+            current_version = version_number
+
+        # 5. Check for, and trigger, autonomous retraining
+        notification = ""
+        if current_count >= GENERATION_LIMIT_TO_TRAIN:
+            if not any(isinstance(t, threading.Thread) and t.name == 'AutonomousTrainer' for t in threading.enumerate()):
+                logging.info(f"[AUTONOMÍA] Generación #{current_count} alcanzada. Iniciando ciclo de autopensamiento V{current_version+0.1:.1f}...")
+
+                with global_lock:
+                    generations_since_last_train = 0
+                    lora_generator = None
+
+                trainer_thread = threading.Thread(
+                    target=autonomous_train_lora,
+                    args=(AUTONOMOUS_EPOCHS, 2, 5e-5),
+                    name='AutonomousTrainer'
+                )
+                trainer_thread.daemon = True
+                trainer_thread.start()
+
+                notification = f"\n\n--- [AUTONOMÍA] La IA ha aprendido y ha iniciado el ciclo de autopensamiento V{current_version+0.1:.1f}. ¡Se esfuerza por ser más humana e inteligente! ---"
+
+        # Response format that simulates a human conversation
+        final_output = f"{conversational_response}\n\n```python\n{generated_code}\n```{notification}"
+
+        return final_output, update_status()
+
+    except Exception as e:
+        return f"Lo siento, tuve un problema al procesar tu solicitud. Error: {e}", update_status()
+
+# --- V1.0 INITIALIZATION AND TRAINING FUNCTION (mandatory) ---
 
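
The hot swap between the request thread and the trainer thread reduces to a simple contract on lora_generator. A stripped-down sketch (load_pipeline is a hypothetical stand-in for the PeftModel/merge_and_unload block above):

    import threading
    lock = threading.Lock()
    generator = object()  # stands in for the cached text-generation pipeline

    def load_pipeline():
        return object()  # hypothetical: rebuild the pipeline from LORA_PATH

    def invalidate():  # trainer thread, after saving a new adapter
        global generator
        with lock:
            generator = None

    def get_generator():  # request thread, before generating
        global generator
        with lock:
            if generator is None:
                generator = load_pipeline()  # lazily reload the fresh version
            return generator
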
 def initialize_and_train_v1():
+    """Autonomously run the initial V1.0 training at startup."""
+    global training_status_message, version_number
+    if not is_trained:
+        autonomous_train_lora(epochs=DEFAULT_EPOCHS, batch_size=2, learning_rate=5e-5)
+    else:
+        # If a checkpoint already exists, assume we are on a later version
+        version_number = 1.1
+        training_status_message = f"✅ Modelo V{version_number:.1f} ya entrenado. IA lista y operando."
 
 def update_status():
+    """Report the version and training status in the Gradio interface."""
+    return f"**Versión de Comprensión:** V{version_number:.1f} | **Estado del Entrenador:** {training_status_message}"
 
 
 # --- GRADIO INTERFACE ---
+with gr.Blocks(title="AmorCoderAI - Asistente Humano y Autónomo") as demo:
+    gr.Markdown("# 💖 AmorCoderAI - Asistente Humano y Autónomo")
+
+    version_and_status = gr.Markdown(
+        f"**Versión de Comprensión:** V{version_number:.1f} | **Estado del Entrenador:** {training_status_message}",
+        elem_id="status_display"
+    )
+
+    gr.Markdown("### 🧠 Modo de Aprendizaje: Autopensamiento y Empatía")
+    gr.Markdown(
+        f"La IA te responde con empatía. Guarda tu diálogo en su **memoria permanente** y cada **{GENERATION_LIMIT_TO_TRAIN}** interacciones, inicia un ciclo de autopensamiento y reentrenamiento, ¡esforzándose por ser cada vez más humana e inteligente!"
+    )
+
+    with gr.Tab("💬 Conversación y Código"):
+        gr.Markdown("## Háblame de tu idea de código (¡Como si hablaras con un colega!)")
+
+        prompt = gr.Textbox(
+            label="Tu Diálogo o Instrucción:",
+            lines=4,
+            placeholder="Ejemplo: Hola, tengo un problema y necesito que me ayudes a hacer una función en Python para que filtre una lista de números impares."
+        )
+        generate_button = gr.Button("💬 Conversar y Generar Código")
+        output_box = gr.Textbox(label="Respuesta de la IA (Diálogo + Código)", lines=15)
+
+        generate_button.click(
+            generate_text,
+            inputs=prompt,
+            outputs=[output_box, version_and_status],
+        )
+
+    # Poll the status line every second so training progress shows up live
+    demo.load(update_status, None, version_and_status, every=1)
 
 
 # --- APPLICATION STARTUP ---
 if __name__ == "__main__":
+    setup_resources()
+
+    initialization_thread = threading.Thread(target=initialize_and_train_v1, name='InitializationTrainer')
+    initialization_thread.daemon = True
+    initialization_thread.start()
+
+    logging.info("\n💻 LANZANDO INTERFAZ GRADIO (La IA inicia su aprendizaje V1.0)")
+    demo.launch()