Student0809
/

interactSpeech

Model card Files Files and versions

interactSpeech / old /check.py

Student0809's picture

Add files using upload-large-folder tool

cb2428f verified 4 months ago

history blame contribute delete

2.24 kB

	import json
	import os
	import torchaudio
	import subprocess

	def check_audio_file(audio_path):
	    """Check that an audio file exists and can be decoded, printing a report.

	    The file is loaded with ``torchaudio.load``; on success its sample rate,
	    duration and channel count are printed. ``sox --i`` is then run on a
	    best-effort basis for extra details; a sox failure only prints a warning
	    and does not affect the result.

	    Args:
	        audio_path: Path of the audio file (``str``, ``bytes`` or
	            ``os.PathLike``).

	    Returns:
	        ``True`` if the file exists and torchaudio loaded it, ``False``
	        otherwise (including when ``audio_path`` is not a path value).
	    """
	    # Entries read from JSON may be null, numbers or objects. os.path.exists()
	    # raises TypeError on e.g. None, so reject non-path values up front
	    # instead of aborting the whole run.
	    if not isinstance(audio_path, (str, bytes, os.PathLike)):
	        print(f"[ERROR] 音频路径类型无效: {audio_path!r}")
	        return False
	    if not os.path.exists(audio_path):
	        print(f"[ERROR] 音频文件不存在: {audio_path}")
	        return False
	    # Derive the statistics inside the try so that an unexpected tensor shape
	    # or a zero sample rate is reported as a load failure *before* any "[OK]"
	    # line is printed.
	    try:
	        waveform, sr = torchaudio.load(audio_path)
	        channels = waveform.shape[0]
	        duration = waveform.shape[1] / sr
	    except Exception as e:
	        # Deliberately broad: the decoding backend decides which exception
	        # type a corrupt or unsupported file produces.
	        print(f"[ERROR] torchaudio加载失败: {audio_path}, 错误: {e}")
	        return False
	    print(f"[OK] torchaudio加载成功: {audio_path}")
	    print(f" 采样率: {sr}, 时长: {duration:.2f}s, 通道数: {channels}")
	    # Best effort: ask the sox command-line tool for detailed file info.
	    try:
	        sox_output = subprocess.check_output(
	            ['sox', '--i', audio_path], stderr=subprocess.STDOUT
	        )
	    except (OSError, subprocess.SubprocessError) as e:
	        # OSError covers a missing sox binary; SubprocessError covers a
	        # non-zero exit status.
	        print(f" [WARN] sox信息获取失败: {e}")
	    else:
	        # errors="replace" keeps undecodable bytes in sox's output from
	        # turning a successful probe into a failure.
	        sox_info = sox_output.decode(errors="replace")
	        print(f" sox信息:\n{sox_info}")
	    return True

	def check_json_fields(obj):
	    """Inspect the ``messages`` and ``solution`` fields of one record.

	    Findings are printed, not raised. For every entry of ``messages`` the
	    ``content`` value must be a string; strings longer than 2000 characters
	    and strings containing control characters other than newline, carriage
	    return and tab produce a warning. A missing ``solution`` key also
	    produces a warning.

	    Args:
	        obj: One decoded JSON record, expected to be a dict.

	    Returns:
	        ``False`` if ``obj`` is not a dict (nothing can be inspected),
	        ``True`` otherwise. Individual findings do not change the result.
	    """
	    if not isinstance(obj, dict):
	        print("[ERROR] 数据不是 JSON 对象")
	        return False
	    messages = obj.get("messages", [])
	    # "messages": null (or any non-sequence) cannot be enumerated; report it
	    # and carry on with the remaining checks.
	    if not isinstance(messages, (list, tuple)):
	        print("[ERROR] messages 字段不是列表")
	        messages = []
	    for i, msg in enumerate(messages):
	        if not isinstance(msg, dict):
	            print(f"[ERROR] messages[{i}] 不是对象")
	            continue
	        content = msg.get("content", "")
	        if not isinstance(content, str):
	            print(f"[ERROR] messages[{i}].content 不是字符串")
	            # The checks below assume a string: len() and ord() raise
	            # TypeError on None, numbers or lists of parts.
	            continue
	        if len(content) > 2000:
	            print(f"[WARN] messages[{i}].content 超长: {len(content)} 字符")
	        if any(ord(c) < 32 and c not in '\n\r\t' for c in content):
	            print(f"[WARN] messages[{i}].content 含有不可见字符")
	    if "solution" not in obj:
	        print("[WARN] 缺少 solution 字段")
	    return True

	def main(jsonl_path="dataset_10k_train.jsonl"):
	    """Check every record of a JSONL dataset file and print a report.

	    Each line is parsed as JSON. Lines that fail to parse, or that do not
	    decode to a JSON object, are reported and skipped. For every object the
	    fields are checked with ``check_json_fields`` and each entry of its
	    ``audios`` field is passed to ``check_audio_file``.

	    Args:
	        jsonl_path: Path of the JSONL file to check. Defaults to the
	            dataset file name this script was written for.
	    """
	    with open(jsonl_path, "r", encoding="utf-8") as f:
	        for number, line in enumerate(f, start=1):
	            print(f"\n==== 检查第 {number} 条数据 ====")
	            try:
	                obj = json.loads(line)
	            except (ValueError, RecursionError) as e:
	                # json.JSONDecodeError subclasses ValueError; RecursionError
	                # covers pathologically nested input.
	                print(f"[ERROR] JSON解析失败: {e}")
	                continue
	            # Valid JSON need not be an object ("[1, 2]", "\"text\"", "3");
	            # such a record has no fields to look up.
	            if not isinstance(obj, dict):
	                print(f"[ERROR] 数据不是 JSON 对象: {type(obj).__name__}")
	                continue
	            check_json_fields(obj)
	            audios = obj.get("audios")
	            if audios is None:
	                # Missing key or explicit null: nothing to check.
	                audios = []
	            elif isinstance(audios, str):
	                # A bare string is a single path; iterating it would check
	                # every character as if it were a file.
	                audios = [audios]
	            elif not isinstance(audios, (list, tuple)):
	                print("[ERROR] audios 字段不是列表")
	                continue
	            for audio_path in audios:
	                check_audio_file(audio_path)
	    print("==== 检查结束 ====")

	# Run the dataset check only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()