"""Sanity-check a JSONL SFT dataset: JSON validity, required fields,
control characters, and tokenized length against a fixed budget."""

import json
import re

from transformers import AutoTokenizer

# --- Configuration ---
dataset_path = "all_dataset_train.jsonl"
model_path = "/root/autodl-tmp/output_7B_FULL_cotSFT/v8-20250720-210226/checkpoint-58"
required_fields = ["input", "output"]
max_token_length = 8192  # hard cap on input+output tokens per record

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_path)
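# Note: if the checkpoint defines a custom tokenizer class, the call above may
# also need trust_remote_code=True; the defaults are assumed sufficient here.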


def has_control_chars(text):
    # Flag ASCII control characters, excluding \t, \n, \r, which are
    # legitimate whitespace in JSON-decoded strings and would otherwise
    # flag nearly every multi-line record.
    return bool(re.search(r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]", text))
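

# Quick self-check of the detector (illustrative only, safe to delete):
# tab and newline pass; a raw ESC byte is flagged.
assert not has_control_chars("hello\tworld\n")
assert has_control_chars("bad\x1bvalue")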


print("Checking dataset...\n")

with open(dataset_path, "r", encoding="utf-8") as f:
    for idx, line in enumerate(f, 1):
        # Blank lines are not valid JSON; skip them rather than report them.
        if not line.strip():
            continue
        try:
            data = json.loads(line)
        except json.JSONDecodeError as e:
            print(f"[Line {idx}] ❌ JSON decode error: {e}")
            continue
        # A valid JSON line may still be a list or scalar; later checks
        # assume a dict.
        if not isinstance(data, dict):
            print(f"[Line {idx}] ❌ Record is not a JSON object")
            continue

        # Required fields must be present, string-typed, and non-empty.
        for field in required_fields:
            if field not in data:
                print(f"[Line {idx}] ❌ Missing required field: '{field}'")
            elif not isinstance(data[field], str):
                print(f"[Line {idx}] ❌ Field '{field}' is not a string")
            elif not data[field].strip():
                print(f"[Line {idx}] ❌ Field '{field}' is empty")

        # Scan both fields for control characters.
        input_text = data.get("input", "")
        output_text = data.get("output", "")
        if not isinstance(input_text, str) or not isinstance(output_text, str):
            continue  # type error already reported above; later checks need strings
        if has_control_chars(input_text + output_text):
            print(f"[Line {idx}] ⚠️ Contains control characters")

        # Token-length check on the raw concatenation. If training applies a
        # chat template, the real sequence will be somewhat longer, so treat
        # this count as a lower bound.
        try:
            tokens = tokenizer(input_text + output_text, return_tensors="pt")
            token_len = tokens["input_ids"].shape[1]
            if token_len > max_token_length:
                print(f"[Line {idx}] ⚠️ Too many tokens: {token_len} > {max_token_length}")
        except Exception as e:
            print(f"[Line {idx}] ❌ Tokenization error: {e}")

print("\n✅ Dataset check complete.")
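
# For reference, a well-formed record looks like this (hypothetical values,
# assuming the {"input": ..., "output": ...} schema checked above):
#   {"input": "Question: 2+2?", "output": "The answer is 4."}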