interactSpeech / old /check.py
Student0809's picture
Add files using upload-large-folder tool
cb2428f verified
import json
import os
import torchaudio
import subprocess
def check_audio_file(audio_path):
    """Check that an audio file exists and can be loaded by torchaudio.

    The path is tested for existence and then loaded with
    ``torchaudio.load``. On success the sample rate, duration and channel
    count derived from the returned waveform are printed, followed by the
    output of ``sox --i`` when that command can be run. All diagnostics are
    written to stdout.

    Args:
        audio_path: Path of the audio file (``str``, ``bytes`` or
            ``os.PathLike``). Any other value is reported as invalid.

    Returns:
        ``True`` if the file exists and torchaudio loaded it, ``False``
        otherwise. A failing ``sox`` call only prints a warning and does not
        change the result.
    """
    # Callers may pass values parsed from JSON (null, numbers, ...).
    # os.path.exists() raises TypeError on None and interprets an int as an
    # open file descriptor, so reject anything that is not path-like first.
    if not isinstance(audio_path, (str, bytes, os.PathLike)):
        print(f"[ERROR] 音频路径无效: {audio_path!r}")
        return False

    # Check that the file exists.
    if not os.path.exists(audio_path):
        print(f"[ERROR] 音频文件不存在: {audio_path}")
        return False

    # Try to load it with torchaudio. The exception types depend on the
    # audio backend, so the broad catch is deliberate; the error is reported.
    try:
        waveform, sr = torchaudio.load(audio_path)
        print(f"[OK] torchaudio加载成功: {audio_path}")
        print(f" 采样率: {sr}, 时长: {waveform.shape[1]/sr:.2f}s, 通道数: {waveform.shape[0]}")
    except Exception as e:
        print(f"[ERROR] torchaudio加载失败: {audio_path}, 错误: {e}")
        return False

    # Best-effort: print detailed information from sox. A missing sox binary
    # or a non-zero exit status is only a warning.
    try:
        raw_info = subprocess.check_output(
            ['sox', '--i', audio_path], stderr=subprocess.STDOUT
        )
        # errors="replace" keeps the report printable even when sox emits
        # bytes that are not valid UTF-8.
        sox_info = raw_info.decode(errors="replace")
        print(f" sox信息:\n{sox_info}")
    except Exception as e:
        print(f" [WARN] sox信息获取失败: {e}")
    return True
def check_json_fields(obj, max_content_len=2000):
    """Validate the text fields of one parsed JSONL record.

    Every entry of ``obj["messages"]`` is inspected: its ``content`` must be
    a string, a warning is printed when it is longer than
    ``max_content_len`` characters, and another when it contains control
    characters (code point below 32) other than newline, carriage return and
    tab. A warning is also printed when the record has no ``solution`` key.
    All findings are written to stdout.

    Args:
        obj: The decoded JSON value of one line; expected to be a ``dict``.
        max_content_len: Length above which a message content is reported
            as too long.

    Returns:
        ``False`` if ``obj`` is not a ``dict`` and therefore could not be
        inspected; ``True`` otherwise, whether or not problems were printed.
    """
    if not isinstance(obj, dict):
        print(f"[ERROR] 数据不是JSON对象: {type(obj).__name__}")
        return False

    # Check the "messages" field.
    messages = obj.get("messages", [])
    if not isinstance(messages, list):
        print("[ERROR] messages 字段不是列表")
        messages = []

    for i, msg in enumerate(messages):
        if not isinstance(msg, dict):
            print(f"[ERROR] messages[{i}] 不是对象")
            continue
        content = msg.get("content", "")
        if not isinstance(content, str):
            print(f"[ERROR] messages[{i}].content 不是字符串")
            # The remaining checks use len() and ord() and only make sense
            # for strings; skipping them avoids a TypeError.
            continue
        if len(content) > max_content_len:
            print(f"[WARN] messages[{i}].content 超长: {len(content)} 字符")
        if any(ord(c) < 32 and c not in '\n\r\t' for c in content):
            print(f"[WARN] messages[{i}].content 含有不可见字符")

    # Check the "solution" field.
    if "solution" not in obj:
        print("[WARN] 缺少 solution 字段")
    return True
def main(jsonl_path="dataset_10k_train.jsonl"):
    """Check every record of a JSONL dataset file.

    Each line is parsed as JSON. For every record that is a JSON object, the
    text fields are validated with ``check_json_fields`` and each path listed
    under ``audios`` is checked with ``check_audio_file``. Lines that cannot
    be parsed, or that are not JSON objects, are reported and skipped. All
    results are printed to stdout.

    Args:
        jsonl_path: Path of the JSONL file to check. Change the default or
            pass your own file.
    """
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for idx, line in enumerate(f):
            print(f"\n==== 检查第 {idx+1} 条数据 ====")
            try:
                obj = json.loads(line)
            except (ValueError, RecursionError) as e:
                # json.JSONDecodeError is a ValueError; RecursionError covers
                # pathologically nested input.
                print(f"[ERROR] JSON解析失败: {e}")
                continue
            if not isinstance(obj, dict):
                # A valid JSON line may still be a list, string or number,
                # which has no fields to look up.
                print(f"[ERROR] 数据不是JSON对象: {type(obj).__name__}")
                continue
            check_json_fields(obj)

            audios = obj.get("audios") or []  # a null value means "no audio"
            if isinstance(audios, str):
                # A bare string is one path; iterating it would check every
                # character as if it were a file.
                audios = [audios]
            elif not isinstance(audios, list):
                print("[ERROR] audios 字段不是列表")
                continue
            for audio_path in audios:
                check_audio_file(audio_path)
    print("==== 检查结束 ====")


if __name__ == "__main__":
    import sys

    # Optional first command-line argument: the JSONL file to check.
    # Without it, main() falls back to its default path.
    main(*sys.argv[1:2])