# oracle/scripts/validate_cache_v2.py
# NOTE(review): the lines below were a Hugging Face page header captured by the
# upload/scrape ("Upload folder using huggingface_hub", rev e605733); kept as
# comments so the file remains valid Python.
#!/usr/bin/env python3
"""
Cache Validation Script: Verify cache files have complete offline data.
Usage:
python scripts/validate_cache_v2.py --cache_dir /workspace/apollo/data/cache --sample_size 100
"""
import argparse
import torch
from pathlib import Path
import random
from collections import Counter
def validate_cache_file(filepath):
    """Validate that a single cache file carries all fields needed for offline training.

    Args:
        filepath: Path to a ``sample_*.pt`` file produced by the cache writer.

    Returns:
        dict with keys:
            valid (bool): True when no issues were found.
            issues (list[str]): Human-readable descriptions of missing fields
                (or a single "Load error: ..." entry if the file is unreadable).
            stats (dict): Trade/wallet/link counts, image presence, OHLC shape.
            is_v2 (bool): True when both v2 offline-mode containers are present.
    """
    # Required base (v1) fields every cache file must carry.
    base_fields = (
        "mint_timestamp", "token_address", "creator_address",
        "trades", "transfers", "quality_score", "ohlc_1s",
    )
    # v2 offline-mode containers and the sub-keys each one must contain.
    # Insertion order matters: it preserves the original issue ordering.
    v2_fields = {
        'cached_wallet_data': ('profiles', 'socials', 'holdings'),
        'cached_graph_data': ('entities', 'links'),
    }
    try:
        # NOTE(review): weights_only=False unpickles arbitrary Python objects —
        # only run this validator on cache files from a trusted source.
        data = torch.load(filepath, map_location='cpu', weights_only=False)

        issues = [f"Missing base field: {field}" for field in base_fields
                  if field not in data]
        for container, subkeys in v2_fields.items():
            if container not in data:
                issues.append(f"Missing {container} (v2)")
            else:
                issues.extend(f"Missing {container}.{key}"
                              for key in subkeys if key not in data[container])

        # Image is optional but good to have.
        has_image = data.get('cached_image_bytes') is not None

        # Collect stats (defensive .get() so partially-formed files still report).
        stats = {
            'num_trades': len(data.get('trades', [])),
            'num_wallets_cached': len(data.get('cached_wallet_data', {}).get('profiles', {})),
            'num_graph_links': len(data.get('cached_graph_data', {}).get('links', {})),
            'has_image': has_image,
            'ohlc_shape': tuple(data['ohlc_1s'].shape) if 'ohlc_1s' in data else None,
        }
        return {
            'valid': not issues,
            'issues': issues,
            'stats': stats,
            'is_v2': all(container in data for container in v2_fields),
        }
    except Exception as e:
        # A corrupt or unreadable file is reported as invalid rather than crashing
        # the whole validation run.
        return {
            'valid': False,
            'issues': [f"Load error: {str(e)}"],
            'stats': {},
            'is_v2': False,
        }
def main():
    """CLI entry point: sample cache files, validate them, and print a summary.

    Exits (returns) early with an error message when the cache directory is
    missing or contains no ``sample_*.pt`` files.
    """
    parser = argparse.ArgumentParser(description="Validate cache files for offline training")
    parser.add_argument("--cache_dir", type=str, required=True, help="Path to cache directory")
    parser.add_argument("--sample_size", type=int, default=100, help="Number of files to sample")
    parser.add_argument("--full", action="store_true", help="Check all files (slow)")
    args = parser.parse_args()

    cache_dir = Path(args.cache_dir)
    if not cache_dir.exists():
        print(f"ERROR: Cache directory not found: {cache_dir}")
        return

    cache_files = list(cache_dir.glob("sample_*.pt"))
    print(f"Found {len(cache_files)} cache files")
    # BUG FIX: the original divided by len(results) unconditionally in the
    # summary, raising ZeroDivisionError when the directory held no cache files.
    if not cache_files:
        print("ERROR: No cache files (sample_*.pt) to validate.")
        return

    if not args.full and len(cache_files) > args.sample_size:
        cache_files = random.sample(cache_files, args.sample_size)
        print(f"Sampling {len(cache_files)} files for validation")

    # Validate files, tagging each result with its source path for reporting.
    results = []
    for f in cache_files:
        result = validate_cache_file(f)
        result['filepath'] = f
        results.append(result)

    # Summary (total is guaranteed > 0 by the early return above).
    total = len(results)
    valid_count = sum(1 for r in results if r['valid'])
    v2_count = sum(1 for r in results if r['is_v2'])
    has_image_count = sum(1 for r in results if r.get('stats', {}).get('has_image', False))

    print("\n" + "=" * 60)
    print("VALIDATION SUMMARY")
    print("=" * 60)
    print(f" Total checked: {total}")
    print(f" Valid: {valid_count} ({100*valid_count/total:.1f}%)")
    print(f" V2 format (complete offline): {v2_count} ({100*v2_count/total:.1f}%)")
    print(f" Has cached image: {has_image_count} ({100*has_image_count/total:.1f}%)")

    # Issue breakdown, most common first.
    all_issues = []
    for r in results:
        all_issues.extend(r['issues'])
    if all_issues:
        print("\nIssue breakdown:")
        for issue, count in Counter(all_issues).most_common():
            print(f" {issue}: {count}")

    # Aggregate stats over files that are already in v2 format.
    v2_results = [r for r in results if r['is_v2']]
    if v2_results:
        avg_wallets = sum(r['stats']['num_wallets_cached'] for r in v2_results) / len(v2_results)
        avg_trades = sum(r['stats']['num_trades'] for r in v2_results) / len(v2_results)
        print(f"\nV2 file stats:")
        print(f" Avg wallets cached: {avg_wallets:.1f}")
        print(f" Avg trades: {avg_trades:.1f}")

    # Show a few example invalid files (first two issues each).
    invalid_results = [r for r in results if not r['valid']]
    if invalid_results:
        print("\nSample invalid files:")
        for r in invalid_results[:5]:
            print(f" {r['filepath'].name}: {r['issues'][:2]}")

    # Recommendation based on how much of the sample is already migrated.
    print("\n" + "=" * 60)
    if v2_count == total:
        print("All files are V2 format. Ready for offline training!")
    elif v2_count == 0:
        print("No V2 files found. Run: python scripts/migrate_cache_v2.py --cache_dir " + str(cache_dir))
    else:
        print(f"Mixed cache: {v2_count} V2, {total - v2_count} old format.")
        print("Old format files will use empty wallet/graph data during training.")
        print("Run migrate_cache_v2.py to upgrade remaining files.")
# Run the CLI only when executed as a script (not when imported).
if __name__ == "__main__":
    main()