#!/usr/bin/env python3
"""
Test script to verify that dataset loading works correctly.
"""
import json
import os
import sys
from itertools import islice

from datasets import load_dataset
def _preview(value, limit=100):
    """Return a printable preview of *value*, at most *limit* characters long.

    ``None`` (a missing key or a null field) is rendered as ``'N/A'`` so that
    the caller never tries to slice ``None``.
    """
    if value is None:
        return 'N/A'
    return str(value)[:limit]


def test_dataset_loading():
    """Smoke-test loading and converting the OpenHermes-FR dataset.

    Steps performed:
      1. Load ``legmlai/openhermes-fr`` with ``load_dataset``.
      2. Print the size of the ``train`` split and a preview of up to three
         examples.
      3. Filter out rows whose ``bad_entry`` field is truthy.
      4. Convert the first five remaining rows that have both ``prompt`` and
         ``accepted_completion`` into ``{'prompt', 'completion'}`` records.
      5. Write those records to ``test_dataset/train.json``.

    Returns:
        bool: ``True`` when every step succeeded, ``False`` when any step
        raised. The error is printed rather than re-raised so the caller can
        turn the result into a process exit code.
    """
    print("Testing dataset loading...")
    try:
        # Load the dataset
        dataset = load_dataset("legmlai/openhermes-fr")
        train_split = dataset['train']
        original_size = len(train_split)
        print("β Dataset loaded successfully")
        print(f" Train samples: {original_size}")

        # Check the first few examples
        print("\nSample examples:")
        for i in range(min(3, original_size)):
            example = train_split[i]
            print(f"\nExample {i+1}:")
            print(f" Keys: {list(example.keys())}")
            # _preview tolerates fields that exist but hold None, which a
            # plain ``.get(key, 'N/A')[:100]`` would crash on.
            print(f" Prompt: {_preview(example.get('prompt'))}...")
            print(f" Accepted completion: {_preview(example.get('accepted_completion'))}...")
            print(f" Bad entry: {example.get('bad_entry', 'N/A')}")

        # Test filtering bad entries
        print("\nFiltering bad entries...")
        filtered_dataset = train_split.filter(
            lambda row: not row.get('bad_entry', False)
        )
        filtered_size = len(filtered_dataset)
        print(f" Original size: {original_size}")
        print(f" Filtered size: {filtered_size}")
        print(f" Removed: {original_size - filtered_size} bad entries")

        # Test conversion to training format
        print("\nTesting conversion to training format...")
        # Only the first 5 filtered rows are examined; rows missing either
        # field are skipped, so fewer than 5 records may be produced.
        train_data = [
            {
                'prompt': row['prompt'],
                'completion': row['accepted_completion'],
            }
            for row in islice(filtered_dataset, 5)
            if 'prompt' in row and 'accepted_completion' in row
        ]
        print(f" Converted {len(train_data)} examples to training format")

        # Save a small sample. UTF-8 with ensure_ascii=False keeps accented
        # French text human-readable and makes the file encoding independent
        # of the platform default.
        os.makedirs('test_dataset', exist_ok=True)
        with open('test_dataset/train.json', 'w', encoding='utf-8') as f:
            json.dump(train_data, f, indent=2, ensure_ascii=False)

        print("β Test completed successfully!")
        print(" Sample saved to: test_dataset/train.json")
        return True
    except Exception as e:
        # Deliberately broad: this is the script's top-level boundary, and any
        # failure (download, filtering, file write) should become a False
        # result instead of a traceback.
        print(f"β Dataset loading failed: {e}")
        return False
if __name__ == "__main__":
    # The process exit status mirrors the check outcome:
    # 0 when the dataset test passed, 1 when it failed.
    exit_code = 0 if test_dataset_loading() else 1
    sys.exit(exit_code)