# oracle/scripts/validate_cache_v2.py
# NOTE(review): the lines below were a Hugging Face page header captured by the
# upload/scrape ("Upload folder using huggingface_hub", rev e605733); kept as
# comments so the file remains valid Python.
#!/usr/bin/env python3
"""
Cache Validation Script: Verify cache files have complete offline data.
Usage:
python scripts/validate_cache_v2.py --cache_dir /workspace/apollo/data/cache --sample_size 100
"""
import argparse
import torch
from pathlib import Path
import random
from collections import Counter
def validate_cache_file(filepath):
    """Validate that a single cache file carries all fields needed for offline training.

    Args:
        filepath: Path to a ``sample_*.pt`` file produced by the cache writer.

    Returns:
        dict with keys:
            valid (bool): True when no issues were found.
            issues (list[str]): Human-readable descriptions of missing fields
                (or a single "Load error: ..." entry if the file is unreadable).
            stats (dict): Trade/wallet/link counts, image presence, OHLC shape.
            is_v2 (bool): True when both v2 offline-mode containers are present.
    """
    # Required base (v1) fields every cache file must carry.
    base_fields = (
        "mint_timestamp", "token_address", "creator_address",
        "trades", "transfers", "quality_score", "ohlc_1s",
    )
    # v2 offline-mode containers and the sub-keys each one must contain.
    # Insertion order matters: it preserves the original issue ordering.
    v2_fields = {
        'cached_wallet_data': ('profiles', 'socials', 'holdings'),
        'cached_graph_data': ('entities', 'links'),
    }
    try:
        # NOTE(review): weights_only=False unpickles arbitrary Python objects —
        # only run this validator on cache files from a trusted source.
        data = torch.load(filepath, map_location='cpu', weights_only=False)

        issues = [f"Missing base field: {field}" for field in base_fields
                  if field not in data]
        for container, subkeys in v2_fields.items():
            if container not in data:
                issues.append(f"Missing {container} (v2)")
            else:
                issues.extend(f"Missing {container}.{key}"
                              for key in subkeys if key not in data[container])

        # Image is optional but good to have.
        has_image = data.get('cached_image_bytes') is not None

        # Collect stats (defensive .get() so partially-formed files still report).
        stats = {
            'num_trades': len(data.get('trades', [])),
            'num_wallets_cached': len(data.get('cached_wallet_data', {}).get('profiles', {})),
            'num_graph_links': len(data.get('cached_graph_data', {}).get('links', {})),
            'has_image': has_image,
            'ohlc_shape': tuple(data['ohlc_1s'].shape) if 'ohlc_1s' in data else None,
        }
        return {
            'valid': not issues,
            'issues': issues,
            'stats': stats,
            'is_v2': all(container in data for container in v2_fields),
        }
    except Exception as e:
        # A corrupt or unreadable file is reported as invalid rather than crashing
        # the whole validation run.
        return {
            'valid': False,
            'issues': [f"Load error: {str(e)}"],
            'stats': {},
            'is_v2': False,
        }
def main():
    """CLI entry point: sample cache files, validate them, and print a summary.

    Exits (returns) early with an error message when the cache directory is
    missing or contains no ``sample_*.pt`` files.
    """
    parser = argparse.ArgumentParser(description="Validate cache files for offline training")
    parser.add_argument("--cache_dir", type=str, required=True, help="Path to cache directory")
    parser.add_argument("--sample_size", type=int, default=100, help="Number of files to sample")
    parser.add_argument("--full", action="store_true", help="Check all files (slow)")
    args = parser.parse_args()

    cache_dir = Path(args.cache_dir)
    if not cache_dir.exists():
        print(f"ERROR: Cache directory not found: {cache_dir}")
        return

    cache_files = list(cache_dir.glob("sample_*.pt"))
    print(f"Found {len(cache_files)} cache files")
    # BUG FIX: the original divided by len(results) unconditionally in the
    # summary, raising ZeroDivisionError when the directory held no cache files.
    if not cache_files:
        print("ERROR: No cache files (sample_*.pt) to validate.")
        return

    if not args.full and len(cache_files) > args.sample_size:
        cache_files = random.sample(cache_files, args.sample_size)
        print(f"Sampling {len(cache_files)} files for validation")

    # Validate files, tagging each result with its source path for reporting.
    results = []
    for f in cache_files:
        result = validate_cache_file(f)
        result['filepath'] = f
        results.append(result)

    # Summary (total is guaranteed > 0 by the early return above).
    total = len(results)
    valid_count = sum(1 for r in results if r['valid'])
    v2_count = sum(1 for r in results if r['is_v2'])
    has_image_count = sum(1 for r in results if r.get('stats', {}).get('has_image', False))

    print("\n" + "=" * 60)
    print("VALIDATION SUMMARY")
    print("=" * 60)
    print(f" Total checked: {total}")
    print(f" Valid: {valid_count} ({100*valid_count/total:.1f}%)")
    print(f" V2 format (complete offline): {v2_count} ({100*v2_count/total:.1f}%)")
    print(f" Has cached image: {has_image_count} ({100*has_image_count/total:.1f}%)")

    # Issue breakdown, most common first.
    all_issues = []
    for r in results:
        all_issues.extend(r['issues'])
    if all_issues:
        print("\nIssue breakdown:")
        for issue, count in Counter(all_issues).most_common():
            print(f" {issue}: {count}")

    # Aggregate stats over files that are already in v2 format.
    v2_results = [r for r in results if r['is_v2']]
    if v2_results:
        avg_wallets = sum(r['stats']['num_wallets_cached'] for r in v2_results) / len(v2_results)
        avg_trades = sum(r['stats']['num_trades'] for r in v2_results) / len(v2_results)
        print(f"\nV2 file stats:")
        print(f" Avg wallets cached: {avg_wallets:.1f}")
        print(f" Avg trades: {avg_trades:.1f}")

    # Show a few example invalid files (first two issues each).
    invalid_results = [r for r in results if not r['valid']]
    if invalid_results:
        print("\nSample invalid files:")
        for r in invalid_results[:5]:
            print(f" {r['filepath'].name}: {r['issues'][:2]}")

    # Recommendation based on how much of the sample is already migrated.
    print("\n" + "=" * 60)
    if v2_count == total:
        print("All files are V2 format. Ready for offline training!")
    elif v2_count == 0:
        print("No V2 files found. Run: python scripts/migrate_cache_v2.py --cache_dir " + str(cache_dir))
    else:
        print(f"Mixed cache: {v2_count} V2, {total - v2_count} old format.")
        print("Old format files will use empty wallet/graph data during training.")
        print("Run migrate_cache_v2.py to upgrade remaining files.")
# Run the CLI only when executed as a script (not when imported).
if __name__ == "__main__":
    main()