Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| """ | |
| Hermes 自愈脚本 - 进程/内存/OOM/配置漂移检测 | |
| 通过 cronjob 定期调用,异常时自动修复或告警 | |
| """ | |
| import subprocess | |
| import json | |
| import os | |
| import sys | |
| from datetime import datetime | |
# File that log() appends to and that main() rotates.
LOG_FILE = "/tmp/hermes-selfheal.log"
# Directory in which check_config_drift() stores its hash baseline.
DATA_DIR = "/data/hermes"


def log(msg, level="INFO"):
    """Print a timestamped message and append it to ``LOG_FILE``.

    Args:
        msg: Text to record.
        level: Severity tag embedded in the line (the file uses "INFO",
            "WARN" and "ERROR").

    Writing to the file is best effort: the line has already been printed,
    so an I/O failure on the file is tolerated instead of aborting the run.
    """
    ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    line = f"[{ts}] [{level}] {msg}"
    print(line)
    try:
        # Messages are non-ASCII. Pin UTF-8 so the write does not depend on
        # the locale of the calling environment, and escape anything that
        # still cannot be encoded rather than losing the whole line.
        with open(
            LOG_FILE, "a", encoding="utf-8", errors="backslashreplace"
        ) as f:
            f.write(line + "\n")
    except OSError:
        # Unwritable or missing log location: stdout already has the line.
        pass
def _run_cleanup_command(cmd, fail_label, timeout=10):
    """Run one cleanup command and report whether it succeeded.

    Args:
        cmd: Argument list handed to ``subprocess.run``.
        fail_label: Prefix of the message logged when the command fails.
        timeout: Seconds to wait for the command.

    Returns:
        True if the command ran and exited with status 0, otherwise False.
    """
    try:
        proc = subprocess.run(
            cmd, capture_output=True, text=True, timeout=timeout
        )
    except Exception as e:
        # Best-effort boundary: a missing binary or a timeout in one step
        # must not prevent the remaining cleanup steps from running.
        log(f"{fail_label}: {e}", "ERROR")
        return False
    if proc.returncode != 0:
        stderr_lines = proc.stderr.strip().splitlines()
        reason = (
            stderr_lines[0] if stderr_lines
            else f"exit code {proc.returncode}"
        )
        log(f"{fail_label}: {reason}", "WARN")
        return False
    return True


def check_memory():
    """Log memory usage reported by ``free -m`` and clean up when it is high.

    Above 90% usage: delete ``*.log`` files older than 7 days from the known
    log directories, purge the pip cache and delete files older than 3 days
    under ``/tmp``. Above 85% (up to 90%): log a warning only.

    Only steps whose command exited successfully are listed in the final
    "cleanup done" message. Every failure is logged; nothing is raised.
    """
    try:
        result = subprocess.run(
            ["free", "-m"], capture_output=True, text=True, timeout=5
        )
        if result.returncode != 0:
            return
        lines = result.stdout.strip().split("\n")
        # The second output row is read as: label, total, used, ...
        parts = lines[1].split() if len(lines) > 1 else []
        if len(parts) < 3:
            raise ValueError(f"unexpected `free -m` output: {result.stdout!r}")
        total_mb = int(parts[1])
        used_mb = int(parts[2])
        if total_mb <= 0:
            raise ValueError(f"invalid total memory: {total_mb}")
        percent = round(used_mb / total_mb * 100, 1)
        log(f"内存: {used_mb}/{total_mb}MB ({percent}%)")
        if percent > 90:
            log("内存超过 90%,执行清理", "WARN")
            # NOTE(review): these steps delete files; that only lowers memory
            # usage for paths backed by tmpfs - confirm this is the intended
            # remedy for memory pressure.
            cleanup_actions = []
            # Old log files.
            for log_dir in ["/data/hermes/logs", "/tmp/hermes/logs", "/app/logs"]:
                if not os.path.exists(log_dir):
                    continue
                if _run_cleanup_command(
                    ["find", log_dir, "-name", "*.log", "-mtime", "+7", "-delete"],
                    "清理日志失败",
                ):
                    cleanup_actions.append(f"清理 {log_dir} 7天前日志")
            # pip cache.
            if _run_cleanup_command(
                ["pip", "cache", "purge"], "清理 pip 缓存失败"
            ):
                cleanup_actions.append("清理 pip 缓存")
            # Stale files in /tmp.
            if _run_cleanup_command(
                ["find", "/tmp", "-type", "f", "-mtime", "+3", "-delete"],
                "清理 /tmp 失败",
            ):
                cleanup_actions.append("清理 /tmp 3天前文件")
            log(f"清理完成: {'; '.join(cleanup_actions)}")
        elif percent > 85:
            log("内存超过 85%,建议关注", "WARN")
    except Exception as e:
        log(f"内存检查失败: {e}", "ERROR")
def check_disk():
    """Log disk usage of ``/data`` and delete old files when it is nearly full.

    Usage is read from ``df``. Above 90% usage, files older than 14 days are
    deleted from ``/data/hermes/logs`` and ``/data/hermes/uploads``. Each
    directory is cleaned independently, so a failure in one does not skip
    the other. Failures are logged; nothing is raised.
    """
    try:
        # -P selects the POSIX output format, which keeps each filesystem on
        # a single line; the default format may wrap long device names and
        # shift the columns parsed below.
        result = subprocess.run(
            ["df", "-m", "-P", "/data"],
            capture_output=True, text=True, timeout=5,
        )
        if result.returncode != 0:
            return
        lines = result.stdout.strip().split("\n")
        if len(lines) < 2:
            return
        # The data row is read as: filesystem, total, used, ...
        parts = lines[1].split()
        if len(parts) < 3:
            raise ValueError(f"unexpected `df` output: {lines[1]!r}")
        total_mb = int(parts[1])
        used_mb = int(parts[2])
        if total_mb <= 0:
            raise ValueError(f"invalid filesystem size: {total_mb}")
        percent = round(used_mb / total_mb * 100, 1)
        log(f"磁盘: {used_mb}/{total_mb}MB ({percent}%)")
        if percent > 90:
            log("磁盘超过 90%,清理旧数据", "WARN")
            for old_dir in ["/data/hermes/logs", "/data/hermes/uploads"]:
                if not os.path.exists(old_dir):
                    continue
                try:
                    cleanup = subprocess.run(
                        ["find", old_dir, "-type", "f", "-mtime", "+14", "-delete"],
                        capture_output=True, text=True, timeout=15,
                    )
                except Exception as e:
                    # Keep going: the other directory may still be cleanable.
                    log(f"清理 {old_dir} 失败: {e}", "ERROR")
                    continue
                if cleanup.returncode != 0:
                    stderr_lines = cleanup.stderr.strip().splitlines()
                    reason = (
                        stderr_lines[0] if stderr_lines
                        else f"exit code {cleanup.returncode}"
                    )
                    log(f"清理 {old_dir} 失败: {reason}", "WARN")
    except Exception as e:
        log(f"磁盘检查失败: {e}", "ERROR")
def check_process():
    """Log whether the Gateway and Dashboard processes exist; restart Gateway.

    The Gateway is detected as any process whose command line matches
    ``entry.py``; the Dashboard as any process whose command line contains
    ``7860``. When no Gateway process is found, ``/app/start.sh`` is launched
    as a detached background process. Failures are logged; nothing is raised.
    """
    try:
        # Gateway (Python process). pgrep patterns are regular expressions,
        # so the dot is escaped to match the literal file name only.
        result = subprocess.run(
            ["pgrep", "-f", r"entry\.py"],
            capture_output=True, text=True, timeout=5,
        )
        gateway_running = result.returncode == 0
        # Dashboard.
        result = subprocess.run(
            ["pgrep", "-f", "7860"], capture_output=True, text=True, timeout=5
        )
        dashboard_running = result.returncode == 0
        log(f"Gateway: {'运行中' if gateway_running else '未运行'}")
        log(f"Dashboard: {'运行中' if dashboard_running else '未运行'}")
        if not gateway_running:
            log("Gateway 未运行!", "ERROR")
            # Attempt a restart.
            try:
                # Launch detached instead of waiting with a timeout:
                # subprocess.run() kills the child when the timeout expires,
                # and captured pipes stay open for as long as any background
                # child holds them, so a successful start would be reported
                # as a failure. NOTE(review): assumes start.sh launches the
                # long-running gateway - confirm.
                subprocess.Popen(
                    ["bash", "/app/start.sh"],
                    stdin=subprocess.DEVNULL,
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.DEVNULL,
                    start_new_session=True,
                )
                log("已尝试重启 Gateway", "WARN")
            except Exception as e:
                log(f"重启失败: {e}", "ERROR")
    except Exception as e:
        log(f"进程检查失败: {e}", "ERROR")
def check_config_drift():
    """Detect changes to tracked config files since the previous run.

    The MD5 digest of each existing config file is compared with the digest
    stored in ``<DATA_DIR>/.config_hashes.json``. A changed digest, or a file
    that had a stored digest but no longer exists, is logged as drift. The
    stored baseline is then replaced with the current digests, so each change
    is reported once. Failures are logged; nothing is raised.
    """
    import hashlib

    config_files = {
        "SOUL.md": "/app/SOUL.md",
        "config.yaml": "/app/config.yaml",
    }
    hash_file = os.path.join(DATA_DIR, ".config_hashes.json")
    try:
        saved_hashes = {}
        if os.path.exists(hash_file):
            try:
                with open(hash_file, "r", encoding="utf-8") as f:
                    loaded = json.load(f)
            except ValueError as e:
                # A truncated or corrupt baseline must not disable the check
                # forever: start over and let the write below replace it.
                log(f"配置 hash 基线无法解析,已重置: {e}", "WARN")
            else:
                if isinstance(loaded, dict):
                    saved_hashes = loaded
        current_hashes = {}
        for name, path in config_files.items():
            if os.path.exists(path):
                with open(path, "rb") as f:
                    # MD5 is used purely for change detection here.
                    current_hashes[name] = hashlib.md5(f.read()).hexdigest()
        drift = {}
        for name in config_files:
            if name not in saved_hashes:
                continue  # no baseline yet, nothing to compare against
            old = str(saved_hashes[name])
            new = current_hashes.get(name)
            if new is None:
                drift[name] = "file missing"
            elif old != new:
                drift[name] = f"hash changed from {old[:8]} to {new[:8]}"
        if drift:
            log(f"配置漂移检测: {drift}", "WARN")
        else:
            log("配置文件无漂移")
        # Store the new baseline; create the data directory on first use.
        os.makedirs(DATA_DIR, exist_ok=True)
        with open(hash_file, "w", encoding="utf-8") as f:
            json.dump(current_hashes, f, indent=2)
    except Exception as e:
        log(f"配置漂移检测失败: {e}", "ERROR")
def check_feishu_connection():
    """Log whether any process whose command line matches "websocket" exists.

    A match is logged as connected. No match is logged as possibly
    disconnected, followed by a warning. Any exception is logged as an error
    and not raised.
    """
    try:
        exit_code = subprocess.run(
            ["pgrep", "-f", "websocket"],
            capture_output=True,
            text=True,
            timeout=5,
        ).returncode
        # pgrep exits with 0 when at least one process matched.
        if exit_code == 0:
            log("飞书 WebSocket: 已连接")
        else:
            log("飞书 WebSocket: 可能断开")
            log("飞书连接可能断开,建议检查", "WARN")
    except Exception as exc:
        log(f"飞书连接检查失败: {exc}", "ERROR")
def main():
    """Run one self-heal pass: rotate the log, then run every check in order."""
    # Rotate before the first log line so that this run's banner and its
    # entries end up in the same file.
    rotated = False
    rotate_error = None
    try:
        if os.path.getsize(LOG_FILE) > 1024 * 100:  # 100KB
            # os.replace overwrites an existing .bak on every platform.
            os.replace(LOG_FILE, LOG_FILE + ".bak")
            rotated = True
    except FileNotFoundError:
        pass  # no log file yet, nothing to rotate
    except OSError as e:
        rotate_error = e

    log("=" * 40)
    log("自愈检查启动")
    if rotated:
        log("日志轮转完成")
    elif rotate_error is not None:
        log(f"日志轮转失败: {rotate_error}", "WARN")

    check_process()
    check_memory()
    check_disk()
    check_config_drift()
    check_feishu_connection()
    log("自愈检查完成")
    log("=" * 40)


if __name__ == "__main__":
    main()