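"""
Check batch5 data quality and format.

Loads the batch5 JSON dataset, validates the role sequence of every
conversation, and prints summary statistics, the role distribution, the
detected format issues and a few sample entries.
"""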
|
|
| import json |
| from collections import Counter |
|
|
def check_conversation_format(conversations):
    """Check whether a conversation follows the expected role format.

    Args:
        conversations: Sequence of message dicts; each message's role is read
            from its 'from' key.

    Returns:
        A tuple ``(issues, has_function_call, has_observation)`` where
        ``issues`` is a list of human-readable problem descriptions (empty
        when nothing was found) and the two booleans tell whether any message
        has the role 'function_call' / 'observation'.

    A message that is not a dict or has no 'from' key is reported as an issue
    instead of raising, so one malformed record cannot abort a whole run.
    """
    # A tuple (not a set) keeps the membership test safe for unhashable role
    # values, which JSON input can produce (e.g. a list or dict under 'from').
    valid_roles = ('human', 'gpt', 'function_call', 'observation', 'system')
    issues = []

    # Collect roles; remember which positions had no usable 'from' field so
    # they are reported once rather than also showing up as role 'None'.
    roles = []
    missing = set()
    for i, conv in enumerate(conversations):
        if isinstance(conv, dict) and 'from' in conv:
            roles.append(conv['from'])
        else:
            roles.append(None)
            missing.add(i)

    # The conversation must open with a human turn.
    if roles and 0 not in missing and roles[0] != 'human':
        issues.append(f"Does not start with 'human', starts with '{roles[0]}'")

    # Every role must be one of the known roles.
    for i, role in enumerate(roles):
        if i in missing:
            issues.append(f"Missing 'from' field at position {i}")
        elif role not in valid_roles:
            issues.append(f"Invalid role '{role}' at position {i}")

    has_function_call = 'function_call' in roles
    has_observation = 'observation' in roles

    # Each function_call must be followed by an observation or gpt message.
    for i, role in enumerate(roles):
        if role != 'function_call':
            continue
        nxt = i + 1
        if nxt >= len(roles):
            issues.append("function_call is last message (missing response)")
        elif nxt not in missing and roles[nxt] not in ('observation', 'gpt'):
            issues.append(
                f"function_call followed by '{roles[nxt]}' instead of observation/gpt"
            )

    return issues, has_function_call, has_observation
|
|
def main(path='data/dolci_10k_with_tool_call_batch5.json'):
    """Load a conversation dataset and print a quality/format report.

    Args:
        path: Path to a UTF-8 JSON file containing a list of sample dicts.
            Defaults to the batch5 dataset file.

    The report, written to stdout, contains: the sample count, how many
    samples carry a non-empty 'tools' / 'system' value, conversation length
    statistics, the distribution of message roles, the format issues found by
    ``check_conversation_format`` (first 10 shown) and a preview of the
    samples at indices 0, 100 and 500 when they exist.
    """
    print("Loading batch5...")
    with open(path, 'r', encoding='utf-8') as f:
        batch5 = json.load(f)

    print(f"Total samples: {len(batch5)}")

    role_counter = Counter()
    samples_with_tools = 0
    samples_with_system = 0
    conversation_length = []
    issues_found = []

    print("\nAnalyzing samples...")
    for idx, sample in enumerate(batch5):
        # `or []` also covers an explicit null 'conversations' value.
        conversations = sample.get('conversations') or []

        # .get() so a message without 'from' is counted (as None) rather than
        # aborting the whole report with a KeyError.
        roles = [conv.get('from') for conv in conversations]
        role_counter.update(roles)

        # Only non-empty values count, matching a truthiness check.
        if sample.get('tools'):
            samples_with_tools += 1
        if sample.get('system'):
            samples_with_system += 1

        conversation_length.append(len(conversations))

        issues, _, _ = check_conversation_format(conversations)
        if issues:
            issues_found.append({
                'index': idx,
                'issues': issues,
                'roles': roles,
            })

    print("\n=== Statistics ===")
    print(f"Samples with 'tools' field: {samples_with_tools}")
    print(f"Samples with 'system' field: {samples_with_system}")
    if conversation_length:
        average = sum(conversation_length) / len(conversation_length)
        print(f"Average conversation length: {average:.2f}")
        print(f"Min conversation length: {min(conversation_length)}")
        print(f"Max conversation length: {max(conversation_length)}")
    else:
        # An empty dataset would otherwise divide by zero / call min() on [].
        print("No samples to analyze; conversation length statistics skipped.")

    print("\n=== Role Distribution ===")
    for role, count in role_counter.most_common():
        print(f"  {role}: {count}")

    print("\n=== Issues Found ===")
    print(f"Total samples with issues: {len(issues_found)}")

    if issues_found:
        print("\nFirst 10 issues:")
        for item in issues_found[:10]:
            print(f"\nSample {item['index']}:")
            print(f"  Roles: {item['roles']}")
            for issue in item['issues']:
                print(f"    - {issue}")

    print("\n=== Sample Entries ===")
    for i in [0, 100, 500]:
        if i >= len(batch5):
            continue
        sample = batch5[i]
        messages = sample.get('conversations') or []
        print(f"\n--- Sample {i} ---")
        print(f"Has tools: {'tools' in sample}")
        print(f"Has system: {'system' in sample}")
        print(f"Conversation roles: {[c.get('from') for c in messages]}")
        # Guard the previews: a sample may have no messages or no 'value'.
        if messages:
            print(f"First message: {str(messages[0].get('value', ''))[:100]}...")
        if len(messages) > 1:
            print(f"Second message from: {messages[1].get('from')}")
            print(f"Second message: {str(messages[1].get('value', ''))[:100]}...")
|
|
# Script entry point: run the report only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    main()
|
|