File size: 5,708 Bytes
b2d9e47
 
169f06f
b2d9e47
 
 
 
a7a7e59
b2d9e47
 
 
e24267e
169f06f
b2d9e47
 
169f06f
b2d9e47
 
 
 
 
169f06f
 
 
b2d9e47
169f06f
 
 
 
 
 
 
 
 
 
a7a7e59
b2d9e47
 
 
169f06f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b2d9e47
 
 
 
 
169f06f
 
 
b2d9e47
169f06f
b2d9e47
169f06f
 
b2d9e47
169f06f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b2d9e47
 
 
 
 
 
169f06f
 
 
 
 
 
 
 
a7a7e59
169f06f
a7a7e59
 
 
 
 
b2d9e47
 
 
 
 
 
 
 
 
 
169f06f
e24267e
 
169f06f
 
 
 
 
 
 
 
 
e24267e
 
 
 
 
 
 
 
a7a7e59
169f06f
a7a7e59
 
e24267e
 
 
 
b2d9e47
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
#!/usr/bin/env python3
"""
Sync agentmemory data to/from a private HF Dataset repo.
Usage:
  python3 sync.py restore   -- download DB from HF on startup
  python3 sync.py backup    -- upload DB to HF (called in loop)
"""
import json
import os
import sys
import shutil
import tempfile
import time

try:
    from huggingface_hub import HfApi, snapshot_download, hf_hub_download
    from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError
except ImportError:
    print("[sync] huggingface_hub not installed, skipping sync")
    sys.exit(0)

# Hub access token, read from the environment. restore() and backup() return
# early without touching the network when this is empty.
HF_TOKEN = os.environ.get("HF_TOKEN", "")
# Dataset repo used as the remote store; override with AGENTMEMORY_DATASET_REPO.
REPO_ID  = os.environ.get("AGENTMEMORY_DATASET_REPO", "Yash030/agentmemory-python-data")
# Local directory holding the data that gets synced.
DATA_DIR = os.path.expanduser("~/.agentmemory")

# Only these paths are backed up/restored — everything else is ephemeral
# Individual files, given relative to DATA_DIR.
SYNC_FILES = [
    "agentmemory.db",
    ".hmac",
]
# Directories (relative to DATA_DIR) that are walked recursively when
# collecting backup targets.
SYNC_DIRS = [
    "second-brain",
]

# backup() writes the fingerprint of the last uploaded state here and compares
# against it to skip uploads when nothing changed. It is not listed in
# SYNC_FILES, so it is never uploaded itself.
STATE_FILE = os.path.join(DATA_DIR, ".backup_state")

def get_api():
    """Build a Hugging Face Hub client that authenticates with ``HF_TOKEN``."""
    client = HfApi(token=HF_TOKEN)
    return client

def _collect_sync_targets():
    """Gather every local file that should be synced.

    Returns:
        A list of ``(absolute_path, repo_relative_path)`` tuples. Entries of
        ``SYNC_FILES`` that are not existing files and entries of
        ``SYNC_DIRS`` that are not existing directories are left out.
        Repo-relative paths always use forward slashes.
    """
    collected = []

    # Stand-alone files that live directly under DATA_DIR.
    for name in SYNC_FILES:
        candidate = os.path.join(DATA_DIR, name)
        if not os.path.isfile(candidate):
            continue
        collected.append((candidate, name))

    # Whole directory trees: every file below each synced directory.
    for subdir in SYNC_DIRS:
        tree_root = os.path.join(DATA_DIR, subdir)
        if not os.path.isdir(tree_root):
            continue
        for parent, _subdirs, filenames in os.walk(tree_root):
            absolute_paths = (os.path.join(parent, filename) for filename in filenames)
            collected.extend(
                # Normalise separators so repo paths look the same on every OS.
                (path, os.path.relpath(path, DATA_DIR).replace("\\", "/"))
                for path in absolute_paths
            )

    return collected

def _state_fingerprint(targets):
    entries = {}
    for full, rel in targets:
        try:
            s = os.stat(full)
            entries[rel] = (s.st_size, s.st_mtime)
        except OSError:
            pass
    return json.dumps(entries, sort_keys=True)

def restore():
    """Download the synced files from the HF dataset repo into ``DATA_DIR``.

    Does nothing when ``HF_TOKEN`` is empty, when the dataset repo does not
    exist, or when the repo check fails. Each file is fetched on its own so
    that one failing download does not prevent the others from being
    restored; files that are not in the repo yet are skipped quietly.

    Returns:
        None. Progress and errors are reported on stdout.
    """
    if not HF_TOKEN:
        print("[sync] No HF_TOKEN — skipping restore")
        return
    os.makedirs(DATA_DIR, exist_ok=True)
    api = get_api()

    # Check repo exists
    try:
        api.repo_info(REPO_ID, repo_type="dataset")
    except RepositoryNotFoundError:
        print(f"[sync] Dataset repo {REPO_ID} not found — fresh start")
        return
    except Exception as e:
        print(f"[sync] restore repo check error: {e}")
        return

    # Fixed files are always attempted; directory contents are discovered by
    # listing the repo. The prefixes come from SYNC_DIRS so restore covers
    # exactly the directories that backup uploads.
    all_targets = list(SYNC_FILES)
    for dname in SYNC_DIRS:
        prefix = dname.rstrip("/") + "/"
        all_targets.extend(_list_repo_prefix(api, prefix))

    if not all_targets:
        print("[sync] Dataset empty — fresh start")
        return

    # Download each sync target individually
    for fname in all_targets:
        try:
            local_path = os.path.join(DATA_DIR, fname)
            os.makedirs(os.path.dirname(local_path), exist_ok=True)
            # NOTE(review): local_dir_use_symlinks is reported as deprecated by
            # recent huggingface_hub releases — confirm the installed version
            # still accepts it.
            hf_hub_download(
                repo_id=REPO_ID,
                filename=fname,
                repo_type="dataset",
                token=HF_TOKEN,
                local_dir=DATA_DIR,
                local_dir_use_symlinks=False,
            )
            print(f"[sync] restored {fname}")
        except EntryNotFoundError:
            pass  # file not yet in repo, skip
        except Exception as e:
            print(f"[sync] restore {fname} error: {e}")

    print("[sync] restore complete")

def _list_repo_prefix(api, prefix):
    """List files in repo matching a path prefix."""
    try:
        from huggingface_hub import list_repo_files
        return [f for f in list_repo_files(REPO_ID, repo_type="dataset", token=HF_TOKEN)
                if f.startswith(prefix)]
    except Exception:
        return []

def backup():
    """Upload the synced files to the HF dataset repo if anything changed.

    Does nothing when ``HF_TOKEN`` is empty or when there is nothing to sync.
    A fingerprint (size and mtime of every target) is compared with the one
    stored in ``STATE_FILE``; when they match the upload is skipped. The
    dataset repo is created as a private repo if it does not exist yet.

    Returns:
        None. Progress and errors are reported on stdout; failures never
        propagate, so a caller running this in a loop keeps going.
    """
    if not HF_TOKEN:
        return
    api = get_api()

    targets = _collect_sync_targets()
    if not targets:
        print("[sync] nothing to backup")
        return

    # Fast change detection
    current_state = _state_fingerprint(targets)
    try:
        with open(STATE_FILE, encoding="utf-8") as fh:
            previous_state = fh.read()
    except FileNotFoundError:
        previous_state = None  # first backup on this machine
    except (OSError, UnicodeError) as e:
        # An unreadable state file must not block the backup itself.
        print(f"[sync] could not read backup state, backing up anyway: {e}")
        previous_state = None
    if previous_state == current_state:
        print("[sync] no changes — skipping backup")
        return

    # Ensure repo exists
    try:
        api.repo_info(REPO_ID, repo_type="dataset")
    except RepositoryNotFoundError:
        print(f"[sync] Creating dataset repo {REPO_ID}")
        try:
            api.create_repo(REPO_ID, repo_type="dataset", private=True)
        except Exception as e:
            print(f"[sync] create_repo error: {e}")
            return
    except Exception as e:
        print(f"[sync] repo_info error: {e}")
        return

    # Stage only the targeted files
    staging = tempfile.mkdtemp(prefix="agentmemory_sync_")
    try:
        staged_all = True
        # NOTE(review): files are copied as-is; if agentmemory.db is a live
        # database being written to, the copy may capture a mid-write state —
        # confirm whether a consistent snapshot is required.
        for full, rel in targets:
            dest = os.path.join(staging, rel.replace("/", os.sep))
            os.makedirs(os.path.dirname(dest), exist_ok=True)
            try:
                shutil.copy2(full, dest)
            except OSError as e:
                staged_all = False
                print(f"[sync] stage {rel} error: {e}")

        print(f"[sync] uploading {len(targets)} files to {REPO_ID}...")
        api.upload_folder(
            folder_path=staging,
            repo_id=REPO_ID,
            repo_type="dataset",
            token=HF_TOKEN,
            commit_message="sync: periodic backup",
        )
        print("[sync] backup complete")

        if not staged_all:
            # Do not record the fingerprint: it describes files that never
            # reached the repo, and saving it would suppress the retry.
            print("[sync] some files were not staged — will retry on next backup")
        else:
            try:
                with open(STATE_FILE, "w", encoding="utf-8") as fh:
                    fh.write(current_state)
            except OSError as e:
                # Losing the state only costs one redundant upload next time.
                print(f"[sync] could not save backup state: {e}")
    except Exception as e:
        print(f"[sync] backup error: {e}")
    finally:
        shutil.rmtree(staging, ignore_errors=True)

if __name__ == "__main__":
    # The first CLI argument selects the action; with no argument, run a backup.
    command = sys.argv[1] if sys.argv[1:] else "backup"
    handlers = {"restore": restore, "backup": backup}
    handler = handlers.get(command)
    if handler is None:
        print(f"[sync] unknown command: {command}")
        sys.exit(1)
    handler()