# interactSpeech/old/checkJSONl.py
import json
import re
from transformers import AutoTokenizer
# Configuration
dataset_path = "all_dataset_train.jsonl"  # path to your dataset file
model_path = "/root/autodl-tmp/output_7B_FULL_cotSFT/v8-20250720-210226/checkpoint-58"  # model used to load the tokenizer
required_fields = ["input", "output"]  # fields every record must contain
max_token_length = 8192  # maximum allowed number of tokens (adjust to your model)
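# A record this script expects looks roughly like the following (illustrative
# example only; field names come from required_fields above, and any extra keys
# in the real dataset are simply ignored by the checks below):
#   {"input": "user prompt text ...", "output": "model response text ..."}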
# Load the tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Check for ASCII control characters
def has_control_chars(text):
    return bool(re.search(r"[\x00-\x1F\x7F]", text))
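# Illustration: has_control_chars("a\tb") is True (tab is \x09, inside the matched
# range), while has_control_chars("a b") is False.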
# Check the dataset line by line
print("Checking dataset...\n")
with open(dataset_path, "r", encoding="utf-8") as f:
    for idx, line in enumerate(f, 1):
        try:
            data = json.loads(line)
        except json.JSONDecodeError as e:
            print(f"[Line {idx}] ❌ JSON decode error: {e}")
            continue

        # Check that required fields are present and non-empty
        for field in required_fields:
            if field not in data:
                print(f"[Line {idx}] ❌ Missing required field: '{field}'")
            elif not data[field].strip():
                print(f"[Line {idx}] ❌ Field '{field}' is empty")

        # Check for control characters
        input_text = data.get("input", "")
        output_text = data.get("output", "")
        if has_control_chars(input_text + output_text):
            print(f"[Line {idx}] ⚠️ Contains control characters")

        # Check token length
        try:
            tokens = tokenizer(input_text + output_text, return_tensors="pt")
            token_len = tokens["input_ids"].shape[1]
            if token_len > max_token_length:
                print(f"[Line {idx}] ⚠️ Too many tokens: {token_len} > {max_token_length}")
        except Exception as e:
            print(f"[Line {idx}] ❌ Tokenization error: {e}")
print("\n✅ Dataset check complete.")