hermes-bot / scripts /selfheal.py
Z User
feat: add DuckDuckGo free fallback for web_search (no API key needed)
4036608
#!/usr/bin/env python3
"""
Hermes 自愈脚本 - 进程/内存/OOM/配置漂移检测
通过 cronjob 定期调用,异常时自动修复或告警
"""
import subprocess
import json
import os
import sys
from datetime import datetime
# File that log() appends every emitted line to; main() renames it to
# LOG_FILE + ".bak" once it grows past 100 KB.
LOG_FILE = "/tmp/hermes-selfheal.log"
# Directory in which check_config_drift() keeps its ".config_hashes.json" baseline.
DATA_DIR = "/data/hermes"
def log(msg, level="INFO"):
    """Print a timestamped log line and append it to LOG_FILE.

    Args:
        msg: Message text.
        level: Severity tag embedded in the line (the file uses "INFO",
            "WARN" and "ERROR").

    The line is always printed to stdout. Appending to LOG_FILE is best
    effort: a failure to write is ignored so that logging never aborts a
    health check.
    """
    ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    line = f"[{ts}] [{level}] {msg}"
    print(line)
    try:
        # Messages contain non-ASCII text; pin UTF-8 so the write does not
        # depend on (and cannot fail because of) the locale's default encoding.
        with open(LOG_FILE, "a", encoding="utf-8") as f:
            f.write(line + "\n")
    except (OSError, UnicodeError):
        # Best effort: the line has already been printed to stdout.
        pass
def _attempt_cleanup(cmd, timeout=10):
    """Run one cleanup command and report the outcome.

    Args:
        cmd: Argument list passed to subprocess.run.
        timeout: Seconds to wait before giving up on the command.

    Returns:
        (True, "") when the command exits with status 0, otherwise
        (False, reason), where reason is the exception text or the exit
        status followed by the first line of stderr.
    """
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        # Missing binary, timeout, or output that could not be decoded.
        return False, str(e)
    if proc.returncode != 0:
        stderr_lines = proc.stderr.strip().splitlines()
        first_line = stderr_lines[0] if stderr_lines else ""
        return False, f"exit {proc.returncode} {first_line}".strip()
    return True, ""


def check_memory():
    """Log memory usage reported by `free -m` and clean up when it is high.

    Above 90% usage the function deletes *.log files older than 7 days from
    the known log directories, purges the pip cache and deletes files older
    than 3 days from /tmp. Between 85% and 90% it only logs a warning.
    Only cleanup steps whose command actually succeeded are listed in the
    summary line; failed steps are logged individually.

    Returns None. If `free` exits non-zero the check is skipped silently;
    any other error is logged rather than raised.

    NOTE(review): deleting files frees RAM only if these paths live on a
    memory-backed filesystem (tmpfs) - confirm that is the intent.
    """
    try:
        result = subprocess.run(
            ["free", "-m"], capture_output=True, text=True, timeout=5
        )
        if result.returncode != 0:
            return
        lines = result.stdout.strip().split("\n")
        # The second output line of `free` is expected to be the "Mem:" row:
        # label, total, used, ...  A malformed row raises and is reported by
        # the outer handler.
        parts = lines[1].split()
        used_mb = int(parts[2])
        total_mb = int(parts[1])
        percent = round(used_mb / total_mb * 100, 1)
        log(f"内存: {used_mb}/{total_mb}MB ({percent}%)")
        if percent > 90:
            log("内存超过 90%,执行清理", "WARN")
            # Each step: (command, label recorded on success,
            #             message prefix on failure, level on failure).
            steps = []
            for log_dir in ["/data/hermes/logs", "/tmp/hermes/logs", "/app/logs"]:
                if os.path.exists(log_dir):
                    steps.append((
                        ["find", log_dir, "-name", "*.log", "-mtime", "+7", "-delete"],
                        f"清理 {log_dir} 7天前日志",
                        "清理日志失败",
                        "ERROR",
                    ))
            steps.append((
                ["pip", "cache", "purge"],
                "清理 pip 缓存",
                "清理 pip 缓存失败",
                "WARN",
            ))
            steps.append((
                ["find", "/tmp", "-type", "f", "-mtime", "+3", "-delete"],
                "清理 /tmp 3天前文件",
                "清理 /tmp 失败",
                "WARN",
            ))
            cleanup_actions = []
            for cmd, done_label, fail_label, fail_level in steps:
                ok, reason = _attempt_cleanup(cmd)
                if ok:
                    cleanup_actions.append(done_label)
                else:
                    # A failed step must not be reported as done, and must
                    # not prevent the remaining steps from running.
                    log(f"{fail_label}: {reason}", fail_level)
            summary = "; ".join(cleanup_actions) if cleanup_actions else "无成功的清理动作"
            log(f"清理完成: {summary}")
        elif percent > 85:
            log("内存超过 85%,建议关注", "WARN")
    except Exception as e:
        log(f"内存检查失败: {e}", "ERROR")
def check_disk():
    """Log disk usage of /data and delete old files when it exceeds 90%.

    Usage is read from `df`. Above 90% the function deletes files older
    than 14 days from the log and upload directories, logging the outcome
    for each directory; a failure in one directory does not stop cleanup
    of the next.

    Returns None. Errors are logged rather than raised.
    """
    try:
        # -P requests the POSIX output format, which keeps each filesystem on
        # a single line; without it some df implementations wrap long device
        # names onto their own line and the column parsing below breaks.
        result = subprocess.run(
            ["df", "-P", "-m", "/data"], capture_output=True, text=True, timeout=5
        )
        if result.returncode != 0:
            log(f"磁盘检查跳过: df 退出码 {result.returncode}", "WARN")
            return
        lines = result.stdout.strip().split("\n")
        if len(lines) < 2:
            return
        # Data row columns: filesystem, total blocks, used, available, ...
        parts = lines[1].split()
        used_mb = int(parts[2])
        total_mb = int(parts[1])
        percent = round(used_mb / total_mb * 100, 1)
        log(f"磁盘: {used_mb}/{total_mb}MB ({percent}%)")
        if percent > 90:
            log("磁盘超过 90%,清理旧数据", "WARN")
            for old_dir in ["/data/hermes/logs", "/data/hermes/uploads"]:
                if not os.path.exists(old_dir):
                    continue
                try:
                    proc = subprocess.run(
                        ["find", old_dir, "-type", "f", "-mtime", "+14", "-delete"],
                        capture_output=True, text=True, timeout=15,
                    )
                except (OSError, ValueError, subprocess.SubprocessError) as e:
                    # Keep going: the other directory may still be cleanable.
                    log(f"清理 {old_dir} 失败: {e}", "ERROR")
                    continue
                if proc.returncode == 0:
                    log(f"已清理 {old_dir} 14天前文件")
                else:
                    log(f"清理 {old_dir} 失败: exit {proc.returncode}", "WARN")
    except Exception as e:
        log(f"磁盘检查失败: {e}", "ERROR")
def check_process():
    """Check whether the Gateway and Dashboard processes are running.

    Both are detected with `pgrep -f`: the Gateway by the pattern
    "entry.py", the Dashboard by the pattern "7860". If the Gateway is not
    found, /app/start.sh is run to restart it and the result is logged
    according to the script's exit status. The Dashboard state is only
    logged.

    Returns None. Errors are logged rather than raised.

    NOTE(review): `pgrep -f` matches anywhere in a command line, so "7860"
    matches any process whose arguments contain that text - confirm the
    patterns are specific enough.
    """
    try:
        # Gateway (Python process).
        result = subprocess.run(
            ["pgrep", "-f", "entry.py"], capture_output=True, text=True, timeout=5
        )
        gateway_running = result.returncode == 0
        # Dashboard.
        result = subprocess.run(
            ["pgrep", "-f", "7860"], capture_output=True, text=True, timeout=5
        )
        dashboard_running = result.returncode == 0
        log(f"Gateway: {'运行中' if gateway_running else '未运行'}")
        log(f"Dashboard: {'运行中' if dashboard_running else '未运行'}")
        if not gateway_running:
            log("Gateway 未运行!", "ERROR")
            # NOTE(review): if start.sh keeps the gateway in the foreground,
            # or leaves a child holding the captured pipes, this call ends in
            # a timeout after 30 s - verify how start.sh launches the gateway.
            try:
                restart = subprocess.run(
                    ["bash", "/app/start.sh"],
                    capture_output=True, text=True, timeout=30,
                )
            except Exception as e:
                log(f"重启失败: {e}", "ERROR")
            else:
                if restart.returncode == 0:
                    log("已尝试重启 Gateway", "WARN")
                else:
                    # A non-zero exit means the start script itself failed;
                    # report that instead of claiming a restart attempt.
                    stderr_lines = restart.stderr.strip().splitlines()
                    detail = stderr_lines[-1] if stderr_lines else ""
                    log(
                        f"重启失败: start.sh 退出码 {restart.returncode} {detail}".rstrip(),
                        "ERROR",
                    )
    except Exception as e:
        log(f"进程检查失败: {e}", "ERROR")
def check_config_drift():
    """Detect unexpected changes to the tracked configuration files.

    The MD5 digest of each tracked file is compared with the digest saved
    by the previous run in DATA_DIR/.config_hashes.json. A changed digest,
    or a tracked file that has disappeared, is logged as drift. The saved
    baseline is then replaced with the current digests, so each change is
    reported once.

    An unreadable or malformed baseline file is treated as "no baseline"
    and rebuilt, instead of failing the check on every run.

    Returns None. Errors are logged rather than raised.
    """
    import hashlib

    config_files = {
        "SOUL.md": "/app/SOUL.md",
        "config.yaml": "/app/config.yaml",
    }
    hash_file = os.path.join(DATA_DIR, ".config_hashes.json")
    try:
        saved_hashes = {}
        if os.path.exists(hash_file):
            try:
                with open(hash_file, "r", encoding="utf-8") as f:
                    loaded = json.load(f)
            except ValueError as e:
                # json.JSONDecodeError and UnicodeDecodeError are ValueErrors.
                log(f"配置哈希基线损坏,重新建立基线: {e}", "WARN")
            else:
                if isinstance(loaded, dict):
                    saved_hashes = loaded
                else:
                    log("配置哈希基线格式异常,重新建立基线", "WARN")

        # MD5 serves only as a change fingerprint here, not as a security
        # measure; it is kept so existing baselines remain comparable.
        current_hashes = {}
        for name, path in config_files.items():
            if os.path.exists(path):
                with open(path, "rb") as f:
                    current_hashes[name] = hashlib.md5(f.read()).hexdigest()

        drift = {}
        for name in config_files:
            old = saved_hashes.get(name)
            if old is None:
                continue  # No baseline yet for this file.
            new = current_hashes.get(name)
            if new is None:
                # The file was tracked last run and is gone now.
                drift[name] = f"file missing (was {str(old)[:8]})"
            elif old != new:
                drift[name] = f"hash changed from {str(old)[:8]} to {new[:8]}"

        if drift:
            log(f"配置漂移检测: {drift}", "WARN")
        else:
            log("配置文件无漂移")

        # Update the saved baseline. Write to a temporary file and rename it
        # into place so an interrupted run cannot leave a truncated baseline.
        tmp_path = hash_file + ".tmp"
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(current_hashes, f, indent=2)
        os.replace(tmp_path, hash_file)
    except Exception as e:
        log(f"配置漂移检测失败: {e}", "ERROR")
def check_feishu_connection():
    """Report whether a Feishu WebSocket process appears to be alive.

    Runs `pgrep -f websocket`; an exit status of 0 is treated as
    "connected". The status is always logged, and an extra warning is
    logged when no matching process is found. Errors are logged rather
    than raised.
    """
    try:
        probe = subprocess.run(
            ["pgrep", "-f", "websocket"],
            capture_output=True,
            text=True,
            timeout=5,
        )
        alive = probe.returncode == 0
        status = "已连接" if alive else "可能断开"
        log(f"飞书 WebSocket: {status}")
        if alive:
            return
        log("飞书连接可能断开,建议检查", "WARN")
    except Exception as err:
        log(f"飞书连接检查失败: {err}", "ERROR")
def main():
    """Rotate the log file if needed, then run every health check in order.

    The log is rotated (renamed to LOG_FILE + ".bak") when it exceeds
    100 KB. Rotation happens before the start banner is written, so all
    lines of one run end up in the same file. A rotation failure is logged
    and does not prevent the checks from running.
    """
    rotated = False
    rotate_error = None
    try:
        if os.path.exists(LOG_FILE) and os.path.getsize(LOG_FILE) > 1024 * 100:  # 100KB
            # os.replace overwrites an existing .bak on every platform.
            os.replace(LOG_FILE, LOG_FILE + ".bak")
            rotated = True
    except OSError as e:
        rotate_error = e

    log("=" * 40)
    log("自愈检查启动")
    if rotated:
        log("日志轮转完成")
    elif rotate_error is not None:
        log(f"日志轮转失败: {rotate_error}", "WARN")

    check_process()
    check_memory()
    check_disk()
    check_config_drift()
    check_feishu_connection()
    log("自愈检查完成")
    log("=" * 40)
# Run the checks only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()