# (extraction residue from the HF Spaces page header: "Spaces: Running Running")
import json
import os
from datetime import datetime, timezone
from typing import Any, Optional

import pandas as pd

from src.display.formatting import styled_error, styled_message, styled_warning
from src.display.utils import BENCHMARK_COLS, COLS
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, TOKEN, QUEUE_REPO, RESULTS_REPO
from src.populate import get_leaderboard_df
from src.submission.check_validity import already_submitted_models, check_model_card, get_model_size, is_model_on_hub
from src.submission.do_exam import run_exam
REQUESTED_MODELS = None | |
USERS_TO_SUBMISSION_DATES = None | |
def add_new_eval(
    model: str,
    base_model: str,
    revision: str,
    precision: str,
    weight_type: str,
    model_type: str,
    submit_type: str = "huggingface",
    openrouter_key: Optional[str] = None,
    exam_theme: Optional[str] = None,
    progress: Optional[Any] = None,
    leaderboard_component=None,  # Gradio component used to refresh the leaderboard
):
    """Add a new evaluation to the system.

    For ``submit_type == "openrouter"`` the exam is run synchronously through
    the OpenRouter API and the results are uploaded to the results dataset.
    For ``submit_type == "huggingface"`` the model is validated on the Hub and
    a PENDING request file is uploaded to the evaluation queue dataset.

    Args:
        model: Model name (HF Hub repo id) or identifier (OpenRouter).
        base_model: Base model for Delta/Adapter weights.
        revision: Model revision (HF Hub only; an empty string means "main").
        precision: Model precision label.
        weight_type: Weight type ("Original"/"Delta"/"Adapter").
        model_type: Model type (pretrained/finetuned/etc); required.
        submit_type: Either "huggingface" or "openrouter".
        openrouter_key: OpenRouter API key (required when submit_type is
            "openrouter").
        exam_theme: Optional exam theme restriction (OpenRouter only).
        progress: Optional callable ``progress(fraction, message)`` used to
            report progress to the UI.
        leaderboard_component: Optional Gradio component updated with the
            refreshed leaderboard after an OpenRouter evaluation.

    Returns:
        A styled HTML message (success, warning or error) for the UI.
    """
    global REQUESTED_MODELS
    global USERS_TO_SUBMISSION_DATES

    # Lazily load the cache of already-submitted models on first use.
    if not REQUESTED_MODELS:
        REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)

    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    if model_type is None or model_type == "":
        return styled_error("Por favor selecciona un tipo de modelo.")

    if submit_type == "huggingface":
        # Validate the Hugging Face model.
        if revision == "":
            revision = "main"

        # Delta/Adapter weights require the base model to exist on the Hub.
        if weight_type in ["Delta", "Adapter"]:
            base_model_on_hub, error, _ = is_model_on_hub(
                model_name=base_model,
                revision=revision,
                token=TOKEN,
                test_tokenizer=True,
            )
            if not base_model_on_hub:
                return styled_error(f'El modelo base "{base_model}" {error}')

        # Adapters load on top of the base model, so the adapter repo itself
        # is not checked for a loadable model.
        if weight_type != "Adapter":
            model_on_hub, error, _ = is_model_on_hub(
                model_name=model,
                revision=revision,
                token=TOKEN,
                test_tokenizer=True,
            )
            if not model_on_hub:
                return styled_error(f'El modelo "{model}" {error}')

        try:
            model_info = API.model_info(repo_id=model, revision=revision)
            model_license = model_info.cardData.get("license", "Unknown")
            model_size = get_model_size(model_info=model_info, precision=precision)
            likes = model_info.likes
        except Exception:
            return styled_error(
                "No se pudo obtener la información del modelo. "
                "Por favor verifica que esté correctamente configurado."
            )

        # Check the model card and license.
        modelcard_OK, error_msg = check_model_card(model)
        if not modelcard_OK:
            return styled_error(error_msg)

    elif submit_type == "openrouter":
        if not openrouter_key:
            return styled_error("Se requiere una API key de OpenRouter.")

        # API-hosted model: there is no Hub metadata to look up.
        revision = "openrouter"
        model_license = "API Service"
        model_size = 0  # Cannot be determined for an API service.
        likes = 0

        # Looks correct, create the evaluation.
        print("Añadiendo nueva evaluación con OpenRouter")

        # Report initial progress when a callback is available.
        if progress is not None:
            progress(0.01, "Iniciando evaluación con OpenRouter...")

        # Configuration stored alongside the results.
        eval_entry = {
            "model": model,
            "base_model": base_model,
            "revision": revision,
            "precision": precision,
            "weight_type": weight_type,
            "status": "RUNNING",  # Running while we evaluate synchronously.
            "submitted_time": current_time,
            "model_type": model_type,
            "submit_type": submit_type,
            "likes": likes,
            "params": model_size,
            "license": model_license,
            "private": False,
        }

        try:
            # Run the exam directly through OpenRouter.
            print(
                f"Ejecutando examen para {model} con OpenRouter"
                + (f" (tema: {exam_theme})" if exam_theme else " (todos los temas)")
            )
            results = run_exam(
                model_name=model,
                openai_api_key=openrouter_key,
                openrouter_base_url="https://openrouter.ai/api/v1",
                exam_theme=exam_theme,
                progress=progress,
            )

            # Mark as FINISHED once the evaluation completed.
            eval_entry["status"] = "FINISHED"

            if progress is not None:
                progress(0.91, "Guardando resultados...")

            model_path = model.replace("/", "_") if "/" in model else model

            # Persist the results locally before uploading.
            results_file = f"{EVAL_RESULTS_PATH}/results_{model_path}_{precision}.json"
            results_data = {
                "config": eval_entry,
                "results": results,
            }
            with open(results_file, "w") as f:
                json.dump(results_data, f)

            # Upload results to the dataset, noting the theme in the commit
            # message when one was selected.
            if progress is not None:
                progress(0.95, "Subiendo resultados al repositorio...")
            print("Subiendo resultados de evaluación")
            commit_message = f"Add {model} evaluation results"
            if exam_theme:
                commit_message += f" (tema: {exam_theme})"
            API.upload_file(
                path_or_fileobj=results_file,
                path_in_repo=f"results_{model_path}_{precision}.json",
                repo_id=RESULTS_REPO,
                repo_type="dataset",
                commit_message=commit_message,
            )

            # Refresh the leaderboard with the new results.
            if progress is not None:
                progress(0.98, "Actualizando leaderboard...")
            if leaderboard_component is not None:
                try:
                    # Recompute the leaderboard and push it to the component.
                    updated_df = get_leaderboard_df(
                        EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS
                    )
                    leaderboard_component.update(value=updated_df)
                except Exception as e:
                    # Best-effort refresh: a UI failure must not hide the
                    # successful evaluation.
                    print(f"Error al actualizar el leaderboard: {e}")

            # Finish progress reporting when available.
            if progress is not None:
                progress(1.0, "¡Evaluación completada con éxito!")

            return styled_message(
                f"¡Evaluación de {model} completada con éxito! "
                + f"Resultados: Puntuación global: {results['overall']['accuracy']:.2f} "
                + f"({results['overall']['total_questions']} preguntas evaluadas)"
            )
        except Exception as e:
            # Surface the failure through the progress callback when available.
            if progress is not None:
                progress(1.0, f"Error: {str(e)}")
            return styled_error(f"Error al evaluar con OpenRouter: {str(e)}")
    else:
        return styled_error("Tipo de submit no válido")

    # Only reached for Hugging Face models (the OpenRouter branch returns above).
    # Looks correct, create the evaluation request.
    print("Añadiendo nueva evaluación")
    eval_entry = {
        "model": model,
        "base_model": base_model,
        "revision": revision,
        "precision": precision,
        "weight_type": weight_type,
        "status": "PENDING",
        "submitted_time": current_time,
        "model_type": model_type,
        "submit_type": submit_type,
        "likes": likes,
        "params": model_size,
        "license": model_license,
        "private": False,
    }

    # Reject duplicate submissions.
    if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
        return styled_warning("Este modelo ya ha sido enviado anteriormente.")

    # Write the request file and upload it to the queue dataset.
    print("Creando archivo de evaluación")
    user_name = model.split("/")[0] if "/" in model else "openrouter"
    model_path = model.split("/")[1] if "/" in model else model

    OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
    os.makedirs(OUT_DIR, exist_ok=True)
    request_file = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
    with open(request_file, "w") as f:
        json.dump(eval_entry, f)

    print("Subiendo archivo de evaluación")
    API.upload_file(
        path_or_fileobj=request_file,
        path_in_repo=f"{user_name}/{model_path}_eval_request_False_{precision}_{weight_type}.json",
        repo_id=QUEUE_REPO,
        repo_type="dataset",
        commit_message=f"Add {model} to eval queue",
    )

    # Remove the local copy once uploaded.
    os.remove(request_file)

    return styled_message(
        "¡Tu solicitud ha sido enviada! Por favor espera hasta una hora "
        "para que el modelo aparezca en la lista PENDING."
    )