eda_trainning_lora / scripts /launch_tensorboard.py
Ademir
Initial clean commit: scripts and config without logs
d4a00b2
#!/usr/bin/env python3
"""
Inicia o servidor TensorBoard sobre o diretorio de logs do treino (Hugging Face Trainer).
Uso tipico apos treino local (o script escolhe logs/ ou results/ com eventos):
python scripts/launch_tensorboard.py
Ou fixar o diretorio:
python scripts/launch_tensorboard.py --logdir ./results
Equivalente a: tensorboard --logdir=... --host 127.0.0.1 --port 6006
"""
from __future__ import annotations
import argparse
import socket
import subprocess
import sys
import threading
import time
from pathlib import Path
def _has_tfevents(root: Path) -> bool:
    """Report whether ``root`` contains at least one TensorBoard event file.

    The directory tree under ``root`` is searched recursively. An entry counts
    when it is a regular file whose name starts with ``events.out.tfevents``.
    A ``root`` that is not an existing directory yields ``False``.
    """
    if not root.is_dir():
        return False
    return any(
        entry.is_file() and entry.name.startswith("events.out.tfevents")
        for entry in root.rglob("*")
    )
def _pick_logdir(cwd: Path) -> Path:
    """Choose the default TensorBoard log directory below ``cwd``.

    Preference order:
      1. ``cwd/logs`` then ``cwd/results``, the first one that is a directory
         and contains ``events.out.tfevents*`` files;
      2. otherwise the first of the two that exists as a directory;
      3. otherwise ``cwd/results``, even though it does not exist.
    """
    candidates = (cwd / "logs", cwd / "results")

    populated = next(
        (option for option in candidates if option.is_dir() and _has_tfevents(option)),
        None,
    )
    if populated is not None:
        return populated

    # No candidate holds event files: settle for any existing directory.
    return next(
        (option for option in candidates if option.is_dir()),
        cwd / "results",
    )
def _can_bind(host: str, port: int) -> bool:
    """Probe whether an IPv4 TCP socket can currently be bound to ``host:port``.

    ``localhost`` is probed as ``127.0.0.1``; any other host is passed to
    ``bind`` unchanged. The probe socket is closed before returning, so a
    ``True`` result is only a hint: another process may take the port
    between this check and the real server start.

    Returns:
        ``True`` when the bind succeeds. ``False`` when the bind fails with
        ``OSError`` or when ``port`` is outside the valid TCP range 0-65535.
    """
    # socket.bind() raises OverflowError (not OSError) for ports outside
    # 0-65535. Callers scan upward from a preferred port, so treat such
    # ports as unavailable instead of letting the scan crash.
    if not 0 <= port <= 65535:
        return False
    bind_host = "127.0.0.1" if host in ("127.0.0.1", "localhost") else host
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
        # On POSIX, SO_REUSEADDR lets the probe ignore sockets lingering in
        # TIME_WAIT. On Windows the same option permits binding a port that
        # another socket is actively using, which would make occupied ports
        # look free, so it is not set there.
        if sys.platform != "win32":
            probe.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        try:
            probe.bind((bind_host, port))
        except OSError:
            return False
        return True
def _resolve_port(host: str, preferred: int, max_attempts: int = 10) -> int:
    """Return the first bindable port at or above ``preferred``.

    Ports ``preferred`` through ``preferred + max_attempts - 1`` are probed in
    ascending order with ``_can_bind``. When none of them is bindable the
    original ``preferred`` value is returned unchanged.
    """
    window = range(preferred, preferred + max_attempts)
    return next(
        (candidate for candidate in window if _can_bind(host, candidate)),
        preferred,
    )
def _tensorboard_module_cmd(logdir: Path, host: str, port: int, reload_interval: int) -> list[str]:
    """Build the argv that runs TensorBoard with the current interpreter.

    The command is ``<sys.executable> -m tensorboard.main`` followed by the
    ``--logdir`` (resolved to an absolute path), ``--host``, ``--port`` and
    ``--reload_interval`` options, in that order.
    """
    # Insertion order of the mapping is the order of the flags on the command line.
    options = {
        "--logdir": str(logdir.resolve()),
        "--host": host,
        "--port": str(port),
        "--reload_interval": str(reload_interval),
    }
    argv = [sys.executable, "-m", "tensorboard.main"]
    for flag, value in options.items():
        argv += [flag, value]
    return argv
def _stderr_bind_failure(text: str) -> bool:
    """Tell whether captured stderr text looks like a port-binding failure.

    The match is case-insensitive and succeeds when the text contains either
    ``could not bind`` or ``already in use``.
    """
    lowered = text.lower()
    return any(marker in lowered for marker in ("could not bind", "already in use"))
def _drain_stderr(stream: object) -> None:
    """Forward every line yielded by ``stream`` to this process's stderr.

    This is best-effort: any exception raised while reading or writing is
    swallowed and ends the forwarding silently.
    """
    try:
        # writelines() consumes the iterable and writes each item in turn.
        sys.stderr.writelines(stream)  # type: ignore[arg-type]
    except Exception:
        pass
def _run_tensorboard_process(
    cmd: list[str],
    host: str,
    port: int,
    quick_fail_seconds: float = 1.25,
) -> tuple[int, bool]:
    """Run ``cmd`` as a child process and block until it exits.

    For the first ``quick_fail_seconds`` the child is polled every 50 ms. If it
    exits inside that window its stderr is read in full, echoed to this
    process's stderr and inspected for a bind failure. Once the window has
    passed, the child's stderr is forwarded by a daemon thread, the browser
    URL built from ``host`` and ``port`` is printed, and the call waits for
    the child to exit.

    A ``KeyboardInterrupt`` at any point after the child was started
    terminates it, escalating to ``kill`` after 8 seconds, and reaps it.

    Returns:
        ``(exit_code, retry_next_port)``. ``retry_next_port`` is ``True`` only
        when the child exited non-zero within the quick-fail window and its
        stderr looks like a bind/port failure (for example a race with the
        ``_can_bind`` probe or another process). After an interrupt the result
        is ``(130, False)``.
    """
    proc = subprocess.Popen(
        cmd,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.PIPE,
        text=True,
    )
    # Always set because stderr=PIPE was requested; narrows the Optional type.
    assert proc.stderr is not None
    try:
        deadline = time.monotonic() + quick_fail_seconds
        while time.monotonic() < deadline:
            if proc.poll() is not None:
                err = proc.stderr.read()
                code = proc.returncode if proc.returncode is not None else 1
                retry_port = code != 0 and _stderr_bind_failure(err)
                if err:
                    sys.stderr.write(err)
                return code, retry_port
            time.sleep(0.05)
        # The child survived the quick-fail window: keep its stderr pipe
        # drained in the background while this thread waits for it to exit.
        threading.Thread(target=_drain_stderr, args=(proc.stderr,), daemon=True).start()
        print(f"Abre no browser: http://{host}:{port}/")
        return proc.wait(), False
    except KeyboardInterrupt:
        # Covers the polling window as well as the long wait, so Ctrl-C never
        # escapes as a traceback while the child is still alive.
        proc.terminate()
        try:
            proc.wait(timeout=8)
        except subprocess.TimeoutExpired:
            proc.kill()
            # Reap the killed child so it does not linger as a zombie.
            proc.wait()
        return 130, False
def _start_tensorboard_with_port_fallback(
    logdir: Path,
    host: str,
    preferred_port: int,
    reload_interval: int,
    max_attempts: int = 10,
) -> int:
    """Start TensorBoard, moving to higher ports when the preferred one is busy.

    The preferred port is probed first. If it cannot be bound, the next
    bindable port within ``max_attempts`` candidates becomes the starting
    point and a notice is printed to stderr. From that starting point up to
    ``max_attempts`` consecutive ports are tried: ports after the first are
    skipped when the probe says they are busy, and a launch that fails with a
    bind error moves on to the next port.

    Returns:
        ``0`` when TensorBoard exited cleanly, the child's exit code when it
        failed for a reason other than a retryable bind error, or ``1`` when
        every candidate port was exhausted.
    """
    preferred = preferred_port
    if not _can_bind(host, preferred):
        # Use the caller's max_attempts for this scan too, so it covers the
        # same number of ports as the launch loop below.
        alt = _resolve_port(host, preferred, max_attempts)
        if alt != preferred:
            print(
                f"Porta {preferred} ocupada (ex.: outro TensorBoard ou Docker). A usar {alt}.",
                file=sys.stderr,
            )
            preferred = alt
    for offset in range(max_attempts):
        port = preferred + offset
        # The starting port is always attempted, even when the probe could not
        # confirm it; later ports are only attempted when they look free.
        if offset > 0 and not _can_bind(host, port):
            continue
        cmd = _tensorboard_module_cmd(
            logdir=logdir,
            host=host,
            port=port,
            reload_interval=reload_interval,
        )
        print(f"Iniciando TensorBoard: {' '.join(cmd)}")
        rc, retry_port = _run_tensorboard_process(cmd, host=host, port=port)
        if rc == 0:
            return 0
        if retry_port and offset + 1 < max_attempts:
            print(
                f"Porta {port} indisponivel ao iniciar. A tentar {port + 1}...",
                file=sys.stderr,
            )
            continue
        return rc
    print("Erro: nao foi possivel abrir o TensorBoard em nenhuma porta tentada.", file=sys.stderr)
    return 1
def main() -> int:
    """Parse the command line, validate the log directory and launch TensorBoard.

    Without ``--logdir`` the directory is chosen by ``_pick_logdir`` relative
    to the current working directory and reported on stderr. A missing
    directory or a missing ``tensorboard`` package is a fatal error; a
    directory without event files only produces a warning.

    Returns:
        ``1`` on a fatal pre-flight error, otherwise the result of
        ``_start_tensorboard_with_port_fallback``.
    """
    parser = argparse.ArgumentParser(
        description="Abre TensorBoard nos logs gerados pelo train.py (report_to=tensorboard).",
    )
    # Declarative table of CLI options: (flag, add_argument keyword arguments).
    option_specs = (
        (
            "--logdir",
            {
                "type": Path,
                "default": None,
                "help": (
                    "Diretorio com events.out.tfevents.* "
                    "(default: auto procura em ./logs e ./results)."
                ),
            },
        ),
        (
            "--host",
            {
                "default": "127.0.0.1",
                "help": "Interface de escuta (default: 127.0.0.1).",
            },
        ),
        (
            "--port",
            {
                "type": int,
                "default": 6006,
                "help": "Porta HTTP preferida (default: 6006). Se estiver ocupada, usa a seguinte livre.",
            },
        ),
        (
            "--reload_interval",
            {
                "type": int,
                "default": 5,
                "help": "Segundos entre recargas ao detetar novos eventos (default: 5).",
            },
        ),
    )
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
    options = parser.parse_args()

    working_dir = Path.cwd()
    auto_selected = options.logdir is None
    chosen = _pick_logdir(working_dir) if auto_selected else options.logdir
    logdir = chosen.resolve()
    if auto_selected:
        print(f"TensorBoard --logdir (auto): {logdir}", file=sys.stderr)

    if not logdir.exists():
        print(f"Erro: diretorio nao existe: {logdir}", file=sys.stderr)
        return 1

    # Missing event files are not fatal: TensorBoard still starts, just empty.
    if not _has_tfevents(logdir):
        print(
            "Aviso: nenhum ficheiro events.out.tfevents* encontrado sob este diretorio. "
            "O TensorBoard pode ficar sem escalares. Se descarregou os logs do servidor, "
            "use --logdir com a pasta onde estao os eventos (ex.: ./logs).",
            file=sys.stderr,
        )

    # Check the package here so the user gets an install hint rather than a
    # failure from the child process.
    try:
        import tensorboard  # noqa: F401
    except ImportError:
        print(
            "Erro: pacote 'tensorboard' nao instalado. Execute: pip install tensorboard",
            file=sys.stderr,
        )
        return 1

    return _start_tensorboard_with_port_fallback(
        logdir=logdir,
        host=options.host,
        preferred_port=options.port,
        reload_interval=options.reload_interval,
    )
# Script entry point: the integer returned by main() becomes the exit status.
if __name__ == "__main__":
    sys.exit(main())