| | |
| | """ |
| | Let's understand the relationship between the datasets by comparing a few records. |
| | """ |
| | import csv |
| | import json |
| |
|
| | |
# Show the CSV's column headers, then print the first row whose repo_name,
# path and language fields are all non-empty.
print("=== function_dataset_v2.csv structure ===")
# newline='' is what the csv module documentation asks for, so that newlines
# embedded inside quoted fields are parsed correctly.
with open('function_dataset_v2.csv', 'r', encoding='utf-8', newline='') as f:
    reader = csv.DictReader(f)
    headers = reader.fieldnames
    print(f"Headers: {headers}")

    print("\nFinding a row with complete metadata...")
    for row in reader:
        # .get() treats a missing column the same as an empty value instead
        # of raising KeyError part-way through the scan.
        if all(row.get(column) for column in ('repo_name', 'path', 'language')):
            print("\nSample row WITH metadata:")
            for column in (
                'original_index',
                'function_index',
                'repo_name',
                'path',
                'language',
                'function_name',
            ):
                print(f" {column}: {row.get(column)}")
            break
    else:
        # Reached only when the loop never hit `break`, i.e. no row qualified.
        print("\nNo row with complete metadata found.")
| |
|
| | |
# Print selected fields of the first entry in the JSONL file (nothing beyond
# the banner is printed when the file is empty).
print("\n\n=== programming_problems.jsonl structure ===")
with open('programming_problems.jsonl', 'r', encoding='utf-8') as jsonl_file:
    first_line = next(jsonl_file, None)
    if first_line is not None:
        entry = json.loads(first_line.strip())

        print("First entry:")
        print(f" row_number: {entry.get('row_number')}")

        # Looked up only after the two prints above so that a missing
        # 'metadata' key fails at the same point in the output.
        metadata = entry['metadata']
        for key in ('original_index', 'function_name'):
            print(f" metadata.{key}: {metadata[key]}")
        # These three are wrapped in quotes so empty strings stay visible.
        for key in ('repo_name', 'path', 'language'):
            print(f" metadata.{key}: '{metadata[key]}'")
| |
|
| | |
# Check whether the first JSONL entry's row_number is the 1-based position of
# its source row in the CSV, by comparing the function names found at that
# position.
print("\n\n=== Checking if row_number matches CSV row ===")
with open('programming_problems.jsonl', 'r', encoding='utf-8') as f:
    data = json.loads(f.readline())
    target_row = data.get('row_number')
    print(f"JSONL row_number: {target_row}")

if target_row is None:
    # Without a row_number there is nothing to look up; say so instead of
    # scanning the entire CSV and printing nothing.
    print("\n❌ First JSONL entry has no row_number; cannot look up a CSV row.")
else:
    # newline='' is what the csv module documentation asks for, so that
    # newlines embedded inside quoted fields are parsed correctly.
    with open('function_dataset_v2.csv', 'r', encoding='utf-8', newline='') as f:
        reader = csv.DictReader(f)
        # start=1: positions count data rows from 1 (DictReader has already
        # consumed the header line).
        for position, row in enumerate(reader, start=1):
            if position != target_row:
                continue

            print(f"\nCSV row {target_row}:")
            print(f" original_index: {row['original_index']}")
            print(f" repo_name: '{row['repo_name']}'")
            print(f" path: '{row['path']}'")
            print(f" language: '{row['language']}'")
            print(f" function_name: '{row['function_name']}'")

            if row['function_name'] == data['metadata']['function_name']:
                print("\n✅ Function names match! We should use row_number as the key.")
            else:
                print("\n❌ Function names don't match.")
            break
        else:
            # Loop finished without `break`: the CSV has no row at that
            # position (too few rows, or row_number is not a usable index).
            print(f"\n❌ No CSV row at position {target_row!r}; nothing to compare.")
| |
|