Spaces:
Running
Running
File size: 2,596 Bytes
769bb84 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
#!/usr/bin/env python3
"""
Test script to verify dataset loading works correctly
"""
import os
import sys
import json
from datasets import load_dataset
def test_dataset_loading():
"""Test loading the OpenHermes-FR dataset"""
print("Testing dataset loading...")
try:
# Load the dataset
dataset = load_dataset("legmlai/openhermes-fr")
print(f"✅ Dataset loaded successfully")
print(f" Train samples: {len(dataset['train'])}")
# Check the first few examples
print("\nSample examples:")
for i in range(min(3, len(dataset['train'])):
example = dataset['train'][i]
print(f"\nExample {i+1}:")
print(f" Keys: {list(example.keys())}")
print(f" Prompt: {example.get('prompt', 'N/A')[:100]}...")
print(f" Accepted completion: {example.get('accepted_completion', 'N/A')[:100]}...")
print(f" Bad entry: {example.get('bad_entry', 'N/A')}")
# Test filtering bad entries
print(f"\nFiltering bad entries...")
original_size = len(dataset['train'])
filtered_dataset = dataset['train'].filter(lambda x: not x.get('bad_entry', False))
filtered_size = len(filtered_dataset)
print(f" Original size: {original_size}")
print(f" Filtered size: {filtered_size}")
print(f" Removed: {original_size - filtered_size} bad entries")
# Test conversion to training format
print(f"\nTesting conversion to training format...")
train_data = []
for i, example in enumerate(filtered_dataset):
if i >= 5: # Just test first 5 examples
break
if 'prompt' in example and 'accepted_completion' in example:
train_data.append({
'prompt': example['prompt'],
'completion': example['accepted_completion']
})
print(f" Converted {len(train_data)} examples to training format")
# Save a small sample
os.makedirs('test_dataset', exist_ok=True)
with open('test_dataset/train.json', 'w') as f:
json.dump(train_data, f, indent=2)
print(f"✅ Test completed successfully!")
print(f" Sample saved to: test_dataset/train.json")
return True
except Exception as e:
print(f"❌ Dataset loading failed: {e}")
return False
if __name__ == "__main__":
success = test_dataset_loading()
sys.exit(0 if success else 1) |