Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| """ | |
| Test script to verify OpenHermes-FR dataset loading | |
| """ | |
| from datasets import load_dataset | |
| import json | |
| import random | |
def test_openhermes_fr() -> bool:
    """Smoke-test loading and converting the OpenHermes-FR dataset.

    Loads ``legmlai/openhermes-fr``, prints the split sizes and the first
    train record, converts up to the first ten train records into
    prompt/completion pairs, and, when the train split has more than 100
    rows, draws five seeded random indices.

    Returns:
        True if every step ran without raising; False if any exception was
        raised (its message is printed).
    """
    print("Loading OpenHermes-FR dataset...")
    try:
        dataset = load_dataset('legmlai/openhermes-fr')
        train_split = dataset['train']
        print("β Dataset loaded successfully")
        print(f" Train samples: {len(train_split)}")
        if 'validation' in dataset:
            print(f" Validation samples: {len(dataset['validation'])}")

        # Show sample structure (long string fields are truncated).
        first_record = train_split[0]
        print("\nπ Sample structure:")
        for key, value in first_record.items():
            if isinstance(value, str) and len(value) > 100:
                print(f" {key}: {value[:100]}...")
            else:
                print(f" {key}: {value}")

        # Test conversion
        print("\nπ Testing conversion...")

        def convert_to_training_format(example):
            """Map a record to {'prompt', 'completion'}, or None if the keys are missing."""
            # Handle OpenHermes-FR format specifically
            if 'prompt' in example and 'accepted_completion' in example:
                return {
                    'prompt': example['prompt'],
                    'completion': example['accepted_completion'],
                }
            if 'prompt' in example and 'completion' in example:
                return {
                    'prompt': example['prompt'],
                    'completion': example['completion'],
                }
            return None

        # Process first 10 examples.
        # Slicing a Dataset (``train_split[:10]``) returns a dict of columns,
        # so iterating it yields column names instead of rows. ``select``
        # returns a Dataset whose iteration yields one dict per row.
        preview_count = min(10, len(train_split))
        train_data = []
        for i, example in enumerate(train_split.select(range(preview_count))):
            training_example = convert_to_training_format(example)
            if not training_example:
                continue
            if not (training_example['prompt'] and training_example['completion']):
                continue
            # Filter out bad entries
            if 'bad_entry' in example and example['bad_entry']:
                print(f" Skipping bad entry {i}")
                continue
            train_data.append(training_example)
            print(f" β Converted example {i}")

        print("\nπ Conversion results:")
        print(f" Converted: {len(train_data)} valid examples")
        if train_data:
            print("\nπ Sample converted example:")
            converted = train_data[0]
            print(f" Prompt: {converted['prompt'][:100]}...")
            print(f" Completion: {converted['completion'][:100]}...")

        # Test sampling (fixed seed so the printed indices are reproducible).
        if len(train_split) > 100:
            print("\nπ² Testing sampling...")
            random.seed(42)
            sampled_indices = random.sample(range(len(train_split)), 5)
            print(f" Sampled indices: {sampled_indices}")
        return True
    except Exception as e:
        # Top-level boundary of the smoke test: report the failure and signal
        # it through the return value instead of propagating.
        print(f"β Error loading dataset: {e}")
        return False
# Script entry point: run the smoke test and report the outcome through the
# process exit status (non-zero on failure).
if __name__ == "__main__":
    success = test_openhermes_fr()
    if success:
        print("\nβ Dataset test completed successfully!")
    else:
        print("\nβ Dataset test failed!")
        # Raise SystemExit directly: the bare ``exit`` helper is injected by
        # the ``site`` module and is not guaranteed to exist in every
        # interpreter (e.g. ``python -S`` or frozen builds).
        raise SystemExit(1)