# interactSpeech/old/checkJSONl.py
import json
import re
from transformers import AutoTokenizer
# Configuration
dataset_path = "all_dataset_train.jsonl"  # path to your dataset file
model_path = "/root/autodl-tmp/output_7B_FULL_cotSFT/v8-20250720-210226/checkpoint-58"  # model used to load the tokenizer
required_fields = ["input", "output"]  # fields every record must contain
max_token_length = 8192  # maximum allowed number of tokens (adjust to your model)
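# A record this script expects looks roughly like the following (illustrative
# example only; field names come from required_fields above, and any extra keys
# in the real dataset are simply ignored by the checks below):
#   {"input": "user prompt text ...", "output": "model response text ..."}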
# Load the tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Check for ASCII control characters
def has_control_chars(text):
    return bool(re.search(r"[\x00-\x1F\x7F]", text))
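# Illustration: has_control_chars("a\tb") is True (tab is \x09, inside the matched
# range), while has_control_chars("a b") is False.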
# Check the dataset line by line
print("Checking dataset...\n")
with open(dataset_path, "r", encoding="utf-8") as f:
    for idx, line in enumerate(f, 1):
        try:
            data = json.loads(line)
        except json.JSONDecodeError as e:
            print(f"[Line {idx}] ❌ JSON decode error: {e}")
            continue

        # Check that required fields are present and non-empty
        for field in required_fields:
            if field not in data:
                print(f"[Line {idx}] ❌ Missing required field: '{field}'")
            elif not data[field].strip():
                print(f"[Line {idx}] ❌ Field '{field}' is empty")

        # Check for control characters
        input_text = data.get("input", "")
        output_text = data.get("output", "")
        if has_control_chars(input_text + output_text):
            print(f"[Line {idx}] ⚠️ Contains control characters")

        # Check token length
        try:
            tokens = tokenizer(input_text + output_text, return_tensors="pt")
            token_len = tokens["input_ids"].shape[1]
            if token_len > max_token_length:
                print(f"[Line {idx}] ⚠️ Too many tokens: {token_len} > {max_token_length}")
        except Exception as e:
            print(f"[Line {idx}] ❌ Tokenization error: {e}")
print("\n✅ Dataset check complete.")