|
|
|
|
| import argparse
|
| import sys
|
| from common import compare_tokens
|
|
|
|
|
def parse_arguments(argv=None):
    """Parse the command-line options for the token comparison script.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process command line. When ``None`` (the default), argparse
            reads ``sys.argv[1:]``, which is what the script entry point
            relies on.

    Returns:
        argparse.Namespace with the attributes ``original``, ``converted``,
        ``suffix``, ``data_dir`` and ``verbose``.

    Raises:
        SystemExit: Raised by argparse on invalid arguments or ``--help``.
    """
    parser = argparse.ArgumentParser(
        description='Compare tokens between two models',
        # Raw formatter keeps the line breaks of the epilog example intact.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s pytorch-gemma-3-270m-it llamacpp-gemma-3-270m-it-bf16
"""
    )
    parser.add_argument(
        'original',
        help='Original model name'
    )
    parser.add_argument(
        'converted',
        help='Converted model name'
    )
    # NOTE: a value that itself starts with '-' (such as "-embeddings") is
    # taken for an option by argparse when passed as a separate word; pass
    # it attached instead, e.g. --suffix=-embeddings.
    parser.add_argument(
        '-s', '--suffix',
        default='',
        help='Type suffix (e.g., "-embeddings")'
    )
    parser.add_argument(
        '-d', '--data-dir',
        default='data',
        help='Directory containing token files (default: data)'
    )
    parser.add_argument(
        '-v', '--verbose',
        action='store_true',
        help='Print prompts from both models'
    )
    return parser.parse_args(argv)
|
|
|
|
|
def main():
    """Run the token comparison and exit with a status reflecting the result.

    With ``--verbose``, the prompt file of each model is looked up in the
    data directory and printed if it exists, followed by a blank line. The
    comparison itself is delegated to ``compare_tokens``; the process exits
    with status 0 when its result is truthy and 1 otherwise.
    """
    args = parse_arguments()

    if args.verbose:
        from pathlib import Path

        root = Path(args.data_dir)
        # Pair each heading with the prompt file it introduces.
        prompts = (
            (
                f"\nOriginal model prompt ({args.original}):",
                root / f"{args.original}{args.suffix}-prompt.txt",
            ),
            (
                f"\nConverted model prompt ({args.converted}):",
                root / f"{args.converted}{args.suffix}-prompt.txt",
            ),
        )
        for heading, prompt_path in prompts:
            # A missing prompt file is skipped silently.
            if not prompt_path.exists():
                continue
            print(heading)
            print(f" {prompt_path.read_text().strip()}")

        print()

    tokens_match = compare_tokens(
        args.original,
        args.converted,
        type_suffix=args.suffix,
        output_dir=args.data_dir,
    )

    exit_code = 0 if tokens_match else 1
    sys.exit(exit_code)
|
|
|
|
|
# Only run the comparison when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|
|