"""
Cache Validation Script: Verify cache files have complete offline data.

Usage:
    python scripts/validate_cache_v2.py --cache_dir /workspace/apollo/data/cache --sample_size 100
"""
|
|
| import argparse |
| import torch |
| from pathlib import Path |
| import random |
| from collections import Counter |
|
|
|
|
def validate_cache_file(filepath):
    """Validate that a single cache file carries every field needed offline.

    Args:
        filepath: Path (or str) to a ``.pt`` cache file produced by the
            caching pipeline.

    Returns:
        dict with keys:
            valid (bool): True when no issues were found.
            issues (list[str]): human-readable descriptions of each problem.
            stats (dict): summary counts; empty when the file failed to load.
            is_v2 (bool): True when both v2 sections are present.

    Never raises: any load failure is reported as an invalid result so the
    caller can keep scanning the rest of the directory.
    """
    try:
        # NOTE(review): weights_only=False deserializes arbitrary pickled
        # objects — only run this against cache files you generated yourself.
        data = torch.load(filepath, map_location='cpu', weights_only=False)

        # A loadable-but-wrong payload (e.g. a bare tensor) would otherwise
        # blow up on the membership tests below and surface as a misleading
        # "Load error"; report it explicitly instead.
        if not isinstance(data, dict):
            return {
                'valid': False,
                'issues': [f"Unexpected payload type: {type(data).__name__}"],
                'stats': {},
                'is_v2': False,
            }

        issues = []

        # Fields every cache version (v1 and v2) must carry.
        base_fields = [
            "mint_timestamp", "token_address", "creator_address",
            "trades", "transfers", "quality_score", "ohlc_1s",
        ]
        issues.extend(
            f"Missing base field: {field}"
            for field in base_fields
            if field not in data
        )

        # V2 sections and their required sub-keys, checked data-driven so the
        # two sections share one code path (issue strings unchanged).
        v2_sections = {
            'cached_wallet_data': ('profiles', 'socials', 'holdings'),
            'cached_graph_data': ('entities', 'links'),
        }
        for section, subkeys in v2_sections.items():
            if section not in data:
                issues.append(f"Missing {section} (v2)")
            else:
                for key in subkeys:
                    if key not in data[section]:
                        issues.append(f"Missing {section}.{key}")

        has_image = data.get('cached_image_bytes') is not None

        stats = {
            'num_trades': len(data.get('trades', [])),
            'num_wallets_cached': len(data.get('cached_wallet_data', {}).get('profiles', {})),
            'num_graph_links': len(data.get('cached_graph_data', {}).get('links', {})),
            'has_image': has_image,
            'ohlc_shape': tuple(data['ohlc_1s'].shape) if 'ohlc_1s' in data else None,
        }

        return {
            'valid': not issues,
            'issues': issues,
            'stats': stats,
            'is_v2': 'cached_wallet_data' in data and 'cached_graph_data' in data,
        }

    except Exception as e:
        # Broad on purpose: any single corrupt file should mark itself
        # invalid rather than abort the whole validation run.
        return {
            'valid': False,
            'issues': [f"Load error: {str(e)}"],
            'stats': {},
            'is_v2': False,
        }
|
|
|
|
def main():
    """CLI entry point: scan a cache directory and print a validation report.

    Reads ``--cache_dir`` (required), ``--sample_size`` (default 100) and
    ``--full`` from argv, validates a sample (or all) of the ``sample_*.pt``
    files, and prints a summary, an issue breakdown, v2 averages, a few
    invalid examples, and a final migration verdict. Returns None.
    """
    parser = argparse.ArgumentParser(description="Validate cache files for offline training")
    parser.add_argument("--cache_dir", type=str, required=True, help="Path to cache directory")
    parser.add_argument("--sample_size", type=int, default=100, help="Number of files to sample")
    parser.add_argument("--full", action="store_true", help="Check all files (slow)")
    args = parser.parse_args()

    cache_dir = Path(args.cache_dir)
    if not cache_dir.exists():
        print(f"ERROR: Cache directory not found: {cache_dir}")
        return

    cache_files = list(cache_dir.glob("sample_*.pt"))
    print(f"Found {len(cache_files)} cache files")

    # Guard against an empty directory: every percentage below divides by
    # len(results), which would otherwise raise ZeroDivisionError.
    if not cache_files:
        print("No cache files to validate.")
        return

    if not args.full and len(cache_files) > args.sample_size:
        cache_files = random.sample(cache_files, args.sample_size)
        print(f"Sampling {len(cache_files)} files for validation")

    results = []
    for f in cache_files:
        result = validate_cache_file(f)
        result['filepath'] = f
        results.append(result)

    valid_count = sum(1 for r in results if r['valid'])
    v2_count = sum(1 for r in results if r['is_v2'])
    has_image_count = sum(1 for r in results if r.get('stats', {}).get('has_image', False))

    print("\n" + "=" * 60)
    print("VALIDATION SUMMARY")
    print("=" * 60)
    print(f"  Total checked: {len(results)}")
    print(f"  Valid: {valid_count} ({100*valid_count/len(results):.1f}%)")
    print(f"  V2 format (complete offline): {v2_count} ({100*v2_count/len(results):.1f}%)")
    print(f"  Has cached image: {has_image_count} ({100*has_image_count/len(results):.1f}%)")

    # Aggregate every issue string across files for a frequency breakdown.
    all_issues = []
    for r in results:
        all_issues.extend(r['issues'])

    if all_issues:
        print("\nIssue breakdown:")
        for issue, count in Counter(all_issues).most_common():
            print(f"  {issue}: {count}")

    # Averages are only meaningful for files that carry the v2 sections.
    v2_results = [r for r in results if r['is_v2']]
    if v2_results:
        avg_wallets = sum(r['stats']['num_wallets_cached'] for r in v2_results) / len(v2_results)
        avg_trades = sum(r['stats']['num_trades'] for r in v2_results) / len(v2_results)
        print(f"\nV2 file stats:")
        print(f"  Avg wallets cached: {avg_wallets:.1f}")
        print(f"  Avg trades: {avg_trades:.1f}")

    # Show a handful of concrete failures to make triage easier.
    invalid_results = [r for r in results if not r['valid']]
    if invalid_results:
        print("\nSample invalid files:")
        for r in invalid_results[:5]:
            print(f"  {r['filepath'].name}: {r['issues'][:2]}")

    # Final verdict: fully migrated, fully unmigrated, or mixed.
    print("\n" + "=" * 60)
    if v2_count == len(results):
        print("All files are V2 format. Ready for offline training!")
    elif v2_count == 0:
        print("No V2 files found. Run: python scripts/migrate_cache_v2.py --cache_dir " + str(cache_dir))
    else:
        print(f"Mixed cache: {v2_count} V2, {len(results)-v2_count} old format.")
        print("Old format files will use empty wallet/graph data during training.")
        print("Run migrate_cache_v2.py to upgrade remaining files.")


if __name__ == "__main__":
    main()
|
|