# rahul7star's picture
# Create debug.py
# 07112eb verified
import os
import sys
import json
import time
import socket
import platform
import subprocess
import traceback
import datetime
from typing import Any, Dict
from fastapi import APIRouter
# Router on which this module's debug endpoint(s) are registered.
router = APIRouter()
# =========================================================
# SAFE EXECUTION WRAPPER (NEVER FAIL)
# =========================================================
def safe_run(name, func):
    """Run ``func()`` and report the outcome as a dict instead of raising.

    Args:
        name: Label of the section being collected. Accepted for call-site
            readability; it is not included in the returned dict.
        func: Zero-argument callable producing the section's data.

    Returns:
        On success ``{"status": "ok", "duration_sec": ..., "data": ...}``.
        On failure ``{"status": "error", "duration_sec": ..., "error": ...,
        "traceback": ...}`` where ``traceback`` is limited to two frames.
    """
    # perf_counter is monotonic, so the duration cannot go negative if the
    # wall clock is adjusted while ``func`` runs.
    start = time.perf_counter()
    try:
        # Call first, stop the clock afterwards: building the result dict
        # inline would evaluate the duration entry *before* calling ``func``
        # and always report ~0 seconds.
        data = func()
    except Exception as exc:
        return {
            "status": "error",
            "duration_sec": round(time.perf_counter() - start, 3),
            "error": str(exc),
            "traceback": traceback.format_exc(limit=2),
        }
    return {
        "status": "ok",
        "duration_sec": round(time.perf_counter() - start, 3),
        "data": data,
    }
# =========================================================
# COMMAND RUNNER
# =========================================================
def run_cmd(cmd, timeout=25):
    """Run an external command and capture its result without raising.

    Args:
        cmd: Command to execute, normally a sequence of arguments passed
            straight to ``subprocess.run`` (no shell involved).
        timeout: Seconds to wait before the command is abandoned.

    Returns:
        ``{"cmd", "returncode", "stdout", "stderr"}`` when the process ran,
        or ``{"cmd", "error"}`` when it could not be started, timed out, or
        failed in any other way.
    """
    # Build the display string up front and defensively: joining the raw
    # arguments fails on non-string items (e.g. pathlib.Path), and doing so
    # inside the except handler would let that failure escape.
    try:
        shown = cmd if isinstance(cmd, str) else " ".join(str(part) for part in cmd)
    except TypeError:
        shown = repr(cmd)

    try:
        completed = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except Exception as exc:
        return {"cmd": shown, "error": str(exc)}

    return {
        "cmd": shown,
        "returncode": completed.returncode,
        "stdout": completed.stdout.strip(),
        "stderr": completed.stderr.strip(),
    }
# =========================================================
# SYSTEM INFO
# =========================================================
def system_info():
    """Collect basic facts about the host and the running interpreter.

    Returns:
        Dict with the current UTC time (naive ISO-8601 string), hostname,
        platform string, Python version, interpreter path, working
        directory, process id and CPU count.
    """
    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # timestamp and drop the tzinfo so the emitted string keeps the same
    # offset-free format as before.
    now_utc = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    return {
        "time_utc": now_utc.isoformat(),
        "hostname": socket.gethostname(),
        "platform": platform.platform(),
        "python": sys.version,
        "executable": sys.executable,
        "cwd": os.getcwd(),
        "pid": os.getpid(),
        "cpu_count": os.cpu_count(),
    }
# =========================================================
# ENVIRONMENT VARIABLES (MASK SECRETS)
# =========================================================
def env_info():
    """Return the process environment with likely secrets masked.

    A variable is fully masked when its name contains one of the sensitive
    markers (case-insensitive). For the remaining variables, credentials
    embedded in URL values (``scheme://user:pass@host``) are masked while the
    rest of the value is kept.

    Returns:
        Dict mapping every environment variable name to its value or to a
        masked form of it.
    """
    import re  # local import: only this function needs it

    hidden = "***hidden***"
    sensitive_markers = (
        "token",
        "secret",
        "password",
        "passwd",
        "key",
        "credential",
        "auth",
        "cookie",
    )
    # Userinfo part of a URL, i.e. what sits between "://" and "@".
    url_userinfo = re.compile(r"(?<=://)[^/\s@]+(?=@)")

    masked = {}
    for name, value in os.environ.items():
        lowered = name.lower()
        if any(marker in lowered for marker in sensitive_markers):
            masked[name] = hidden
        else:
            # Connection strings such as DATABASE_URL carry credentials in
            # the value even though the name looks harmless.
            masked[name] = url_userinfo.sub(hidden, value)
    return masked
# =========================================================
# GPU / TORCH INFO
# =========================================================
def gpu_info():
    """Describe the torch / CUDA setup visible to this process.

    Returns:
        Dict with CUDA availability, device count and torch version; when
        CUDA is available, also the name and memory figures of device 0.
        If torch cannot be imported or any query fails, a dict with a single
        ``torch_error`` message is returned instead.
    """
    try:
        import torch

        cuda = torch.cuda
        report = {
            "cuda_available": cuda.is_available(),
            "device_count": cuda.device_count(),
            "torch_version": torch.__version__,
        }
        # Without CUDA there is nothing device-specific to add.
        if not cuda.is_available():
            return report

        report["device_name"] = cuda.get_device_name(0)
        report["memory_allocated"] = cuda.memory_allocated(0)
        report["memory_reserved"] = cuda.memory_reserved(0)
        report["memory_total"] = cuda.get_device_properties(0).total_memory
        return report
    except Exception as failure:
        return {"torch_error": str(failure)}
# =========================================================
# HUGGING FACE CACHE
# =========================================================
def hf_cache_info():
    """Report on the Hugging Face cache directory.

    The directory is taken from the ``HF_HOME`` environment variable and
    falls back to ``/tmp/huggingface``.

    Returns:
        Dict with ``cache_dir``, ``exists`` and ``files_sample`` (up to 30
        entry names, sorted so the sample is stable between calls). When the
        path exists but cannot be listed (not a directory, no permission),
        ``files_sample`` is empty and ``list_error`` holds the reason.
    """
    cache_dir = os.getenv("HF_HOME", "/tmp/huggingface")
    info = {
        "cache_dir": cache_dir,
        "exists": os.path.exists(cache_dir),
        "files_sample": [],
    }
    # Try the listing directly instead of checking existence first: the path
    # may be a file, unreadable, or vanish between the check and the listing.
    try:
        info["files_sample"] = sorted(os.listdir(cache_dir))[:30]
    except FileNotFoundError:
        pass  # missing cache: an empty sample is the expected answer
    except OSError as exc:
        info["list_error"] = str(exc)
    return info
# =========================================================
# INSTALLED PACKAGES
# =========================================================
def packages_info():
    """List installed distributions as sorted ``name==version`` strings.

    Uses the standard-library ``importlib.metadata`` so the report does not
    depend on setuptools being installed; ``pkg_resources`` is only used as a
    fallback on interpreters that lack ``importlib.metadata``.

    Returns:
        Sorted list of unique ``name==version`` strings, or
        ``{"error": ...}`` if the lookup fails.
    """
    try:
        try:
            from importlib import metadata
        except ImportError:  # Python < 3.8
            import pkg_resources

            return sorted(
                f"{p.project_name}=={p.version}" for p in pkg_resources.working_set
            )

        pins = set()  # a set: the same distribution can be found twice on sys.path
        for dist in metadata.distributions():
            meta = dist.metadata
            name = meta.get("Name") if meta is not None else None
            if not name:
                continue  # broken install without usable metadata
            version = meta.get("Version", "unknown")
            pins.add(f"{name}=={version}")
        return sorted(pins)
    except Exception as exc:
        return {"error": str(exc)}
# =========================================================
# DISK + MEMORY
# =========================================================
def disk_info():
    """Capture disk and memory usage by running ``df -h`` and ``free -h``.

    Returns:
        Dict with the ``run_cmd`` result of each command under
        ``disk_usage`` and ``memory``.
    """
    df_result = run_cmd(["df", "-h"])
    free_result = run_cmd(["free", "-h"])
    return {"disk_usage": df_result, "memory": free_result}
# =========================================================
# NETWORK INFO
# =========================================================
def network_info():
    """Report the hostname and the IPv4 address it resolves to.

    Returns:
        Dict with ``hostname`` and ``ip``. When the hostname cannot be
        resolved, ``ip`` is ``None`` and ``ip_error`` holds the reason, so
        the hostname is still reported.
    """
    hostname = socket.gethostname()
    info = {"hostname": hostname, "ip": None}
    try:
        info["ip"] = socket.gethostbyname(hostname)
    except OSError as exc:  # includes socket.gaierror / socket.herror
        info["ip_error"] = str(exc)
    return info
# =========================================================
# HF CLI CHECKS
# =========================================================
def hf_cli_info():
    """Run the ``huggingface-cli`` ``whoami`` and ``scan-cache`` subcommands.

    Returns:
        Dict with the ``run_cmd`` result of each subcommand under
        ``whoami`` and ``scan_cache``.
    """
    # report key -> CLI subcommand, in the order they are executed
    subcommands = {"whoami": "whoami", "scan_cache": "scan-cache"}
    return {
        label: run_cmd(["huggingface-cli", subcommand])
        for label, subcommand in subcommands.items()
    }
# =========================================================
# PYTHON RUNTIME STATE
# =========================================================
def runtime_info():
    """Summarise the interpreter's live state.

    Returns:
        Dict with the names of all currently alive threads under
        ``active_threads`` and the number of entries in ``sys.modules``
        under ``loaded_modules_count``.
    """
    import threading

    thread_names = []
    for thread in threading.enumerate():
        thread_names.append(thread.name)

    module_total = len(sys.modules)
    return {
        "active_threads": thread_names,
        "loaded_modules_count": module_total,
    }
# =========================================================
# SPACE DETECTION (HF SPECIFIC)
# =========================================================
def hf_space_info():
    """Read the ``SPACE_*`` environment variables.

    Returns:
        Dict with ``is_space`` (whether ``SPACE_ID`` is set) and the values
        of ``SPACE_ID``, ``SPACE_HARDWARE`` and ``SPACE_SDK`` (``None`` when
        unset) under ``space_id``, ``hardware`` and ``sdk``.
    """
    env = os.environ
    details = {"is_space": "SPACE_ID" in env}
    for field, variable in (
        ("space_id", "SPACE_ID"),
        ("hardware", "SPACE_HARDWARE"),
        ("sdk", "SPACE_SDK"),
    ):
        details[field] = env.get(variable)
    return details
# =========================================================
# MAIN UNIVERSAL DEBUG ENDPOINT
# =========================================================
@router.get("/debug/full", include_in_schema=False)
def full_debug() -> Dict[str, Any]:
    """Build the full diagnostic report served at ``GET /debug/full``.

    Every section is collected through ``safe_run``, so a failing collector
    shows up as an ``"error"`` entry in its own section instead of failing
    the whole request.

    Returns:
        ``{"status": "ok", "generated_at": <naive UTC ISO-8601 string>,
        "report": {<section>: <safe_run result>, ...}}``.
    """
    # NOTE(review): the report includes environment variables and host
    # details; confirm this route is not reachable by untrusted clients.

    # (report key, label passed to safe_run, collector), in output order.
    sections = (
        ("system", "system", system_info),
        ("environment", "env", env_info),
        ("gpu", "gpu", gpu_info),
        ("disk", "disk", disk_info),
        ("network", "network", network_info),
        ("hf_space", "hf_space", hf_space_info),
        ("hf_cache", "hf_cache", hf_cache_info),
        ("hf_cli", "hf_cli", hf_cli_info),
        ("runtime", "runtime", runtime_info),
        ("packages", "packages", packages_info),
    )
    report = {key: safe_run(label, collect) for key, label, collect in sections}

    # datetime.utcnow() is deprecated since Python 3.12; an aware timestamp
    # with the tzinfo dropped yields the same offset-free string.
    generated_at = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    return {
        "status": "ok",
        "generated_at": generated_at.isoformat(),
        "report": report,
    }