|
|
import json |
|
|
import os |
|
|
import torchaudio |
|
|
import subprocess |
|
|
|
|
|
def check_audio_file(audio_path):
    """Check that an audio file exists and can be decoded.

    A diagnostic line is printed for every step. The file must exist and
    load through ``torchaudio.load``; the ``sox --i`` header dump that
    follows is informational only, and a failure there produces a warning
    without changing the result.

    Args:
        audio_path: Filesystem path (``str``, ``bytes`` or ``os.PathLike``)
            of the audio file to inspect.

    Returns:
        True if the file exists and torchaudio loaded it; False if the
        path has an unusable type, does not exist, or fails to load.
    """
    # Reject non-path values up front: os.path.exists() raises TypeError on
    # None/dict and interprets an int as a file descriptor, so a malformed
    # dataset entry would otherwise abort or mislead the whole scan.
    if not isinstance(audio_path, (str, bytes, os.PathLike)):
        print(f"[ERROR] 音频路径类型无效: {audio_path!r}")
        return False

    if not os.path.exists(audio_path):
        print(f"[ERROR] 音频文件不存在: {audio_path}")
        return False

    try:
        waveform, sr = torchaudio.load(audio_path)
        print(f"[OK] torchaudio加载成功: {audio_path}")
        # shape[0] is reported as the channel count and shape[1] / sr as
        # the duration in seconds. Kept inside the try so that a
        # degenerate result (e.g. sr == 0) is reported as a failure
        # instead of raising.
        print(f" 采样率: {sr}, 时长: {waveform.shape[1]/sr:.2f}s, 通道数: {waveform.shape[0]}")
    except Exception as e:
        # Deliberately broad: any decoder/backend error means "unusable".
        print(f"[ERROR] torchaudio加载失败: {audio_path}, 错误: {e}")
        return False

    try:
        raw_info = subprocess.check_output(
            ['sox', '--i', audio_path], stderr=subprocess.STDOUT
        )
        # errors="replace" keeps the report readable even when sox echoes
        # bytes that are not valid UTF-8 (e.g. an oddly encoded file name).
        sox_info = raw_info.decode(errors="replace")
        print(f" sox信息:\n{sox_info}")
    except Exception as e:
        # Best-effort extra information only (sox may not be installed);
        # never fail the check because of it.
        print(f" [WARN] sox信息获取失败: {e}")
    return True
|
|
|
|
|
def check_json_fields(obj, *, max_content_len=2000):
    """Validate the text fields of one decoded dataset record.

    Every finding is printed to stdout. Checks performed:

    * ``obj`` is a JSON object and ``messages`` (when present) is a list;
    * each message is an object whose ``content`` is a string (a missing
      ``content`` counts as the empty string);
    * string contents are no longer than ``max_content_len`` characters
      (warning only);
    * string contents hold no control characters (code point < 32) other
      than newline, carriage return and tab (warning only);
    * a ``solution`` key is present (warning only).

    Args:
        obj: One record, as produced by ``json.loads``.
        max_content_len: Length above which a content string triggers the
            "too long" warning. Keyword-only; defaults to 2000.

    Returns:
        True if no ``[ERROR]`` was reported (warnings do not count),
        False otherwise.
    """
    if not isinstance(obj, dict):
        # A JSONL line may decode to a list/str/number; .get() would raise.
        print(f"[ERROR] 数据不是JSON对象: {type(obj).__name__}")
        return False

    ok = True

    messages = obj.get("messages", [])
    if not isinstance(messages, list):
        print("[ERROR] messages 不是列表")
        ok = False
        messages = []

    for i, msg in enumerate(messages):
        if not isinstance(msg, dict):
            print(f"[ERROR] messages[{i}] 不是对象")
            ok = False
            continue

        content = msg.get("content", "")
        if not isinstance(content, str):
            print(f"[ERROR] messages[{i}].content 不是字符串")
            ok = False
            # The remaining checks are string-only; running len()/ord() on
            # None, numbers or nested structures would raise TypeError.
            continue

        if len(content) > max_content_len:
            print(f"[WARN] messages[{i}].content 超长: {len(content)} 字符")
        if any(ord(c) < 32 and c not in '\n\r\t' for c in content):
            print(f"[WARN] messages[{i}].content 含有不可见字符")

    if "solution" not in obj:
        print("[WARN] 缺少 solution 字段")
    return ok
|
|
|
|
|
def main(jsonl_path="dataset_10k_train.jsonl"):
    """Scan a JSONL dataset and print a validation report for every line.

    Each line is parsed as JSON. Lines that fail to parse, or that do not
    decode to a JSON object, are reported and skipped. For the remaining
    records the text fields are checked with ``check_json_fields`` and
    every path listed under ``audios`` is checked with
    ``check_audio_file``.

    Args:
        jsonl_path: Path of the JSONL file to scan. Defaults to the file
            name that was previously hard-coded here.

    Raises:
        OSError: If ``jsonl_path`` cannot be opened.
    """
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            print(f"\n==== 检查第 {line_no} 条数据 ====")

            try:
                obj = json.loads(line)
            except (ValueError, RecursionError) as e:
                # ValueError covers json.JSONDecodeError; RecursionError
                # covers pathologically nested input.
                print(f"[ERROR] JSON解析失败: {e}")
                continue

            if not isinstance(obj, dict):
                # Valid JSON but not an object; the field lookups below
                # need a dict.
                print(f"[ERROR] 数据不是JSON对象: {type(obj).__name__}")
                continue

            check_json_fields(obj)

            audios = obj.get("audios")
            if audios is None:
                # Missing key or explicit null: nothing to check.
                audios = []
            elif isinstance(audios, str):
                # Iterating a string would check one "path" per character.
                print("[WARN] audios 不是列表，按单个路径处理")
                audios = [audios]
            elif not isinstance(audios, list):
                print("[ERROR] audios 不是列表")
                continue

            for audio_path in audios:
                check_audio_file(audio_path)

    print("==== 检查结束 ====")
|
|
|
|
|
# Script entry point: run the dataset check only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()