|
|
import os |
|
|
import csv |
|
|
import wave |
|
|
import re |
|
|
import json |
|
|
from pathlib import Path |
|
|
import subprocess |
|
|
from subprocess import CompletedProcess |
|
|
|
|
|
def add_text_index(text_file='../test_data/recordings/text/test_asr_zh.txt'):
    """Print the content lines of a text file, each prefixed with a running number.

    Every line is stripped of surrounding whitespace. Blank lines and lines
    starting with ``#`` are skipped and do not consume a number. The remaining
    lines are printed to stdout as ``"<n>. <line>"`` with ``n`` starting at 1.

    Args:
        text_file: Path of the UTF-8 text file to read. Defaults to the
            location the script has always used, so existing calls without
            arguments behave as before.
    """
    index = 1
    with open(text_file, encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            # Empty lines and '#' comment lines are not numbered.
            if not line or line.startswith('#'):
                continue
            print(f"{index}. {line}")
            index += 1
|
|
|
|
|
def get_lines_with_index(filepath):
    """Yield ``(number, text)`` pairs for the numbered lines of a UTF-8 file.

    Each line is stripped and matched against ``<digits>.<optional spaces><rest>``.
    Lines that do not start with such a number prefix are ignored.

    Args:
        filepath: Path of the text file to read.

    Yields:
        Tuples ``(number, text)`` where ``number`` is the digit run as a
        string (not converted to int) and ``text`` is the remainder of the
        line after the dot and any following whitespace.
    """
    numbered = re.compile(r'^(\d+)\.\s*(.*)')
    with open(filepath, encoding='utf-8') as handle:
        for raw in handle:
            found = numbered.match(raw.strip())
            if found is None:
                continue
            number, text = found.groups()
            yield number, text
|
|
|
|
|
def get_wav_length(wav_path):
    """Return the duration of a WAV file in seconds.

    Args:
        wav_path: Location of the WAV file, as a string or an ``os.PathLike``
            object (e.g. ``pathlib.Path``). Any other object is handed to
            ``wave.open`` unchanged.

    Returns:
        The frame count divided by the frame rate, as a float. If anything
        goes wrong while opening or reading the file, the error is printed
        and ``0`` is returned instead of raising (best-effort behavior).
    """
    try:
        # wave.open() has historically opened the file itself only for a plain
        # str and treated everything else as a file object, so path objects
        # are converted to their string form first.
        if isinstance(wav_path, os.PathLike):
            wav_path = os.fspath(wav_path)
        with wave.open(wav_path, 'rb') as wf:
            duration = wf.getnframes() / float(wf.getframerate())
    except Exception as e:
        # Deliberately broad: callers expect a number for every file.
        print(f"Error reading {wav_path}: {e}")
        return 0
    return duration
|
|
|
|
|
def write_csv(rows, output_csv):
    """Write a header line followed by ``rows`` to a UTF-8 CSV file.

    The header columns are the fixed labels '序号', '文本', '音频长度(秒)'
    (index, text, audio length in seconds). An existing file is overwritten.

    Args:
        rows: Iterable of row sequences, passed to ``csv.writer.writerows``.
        output_csv: Destination path of the CSV file. Missing parent
            directories are created.
    """
    # Opening a file inside a folder that does not exist yet would raise
    # FileNotFoundError, so make sure the destination folder is there.
    Path(output_csv).parent.mkdir(parents=True, exist_ok=True)
    # newline='' lets the csv module control line endings itself.
    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['序号', '文本', '音频长度(秒)'])
        writer.writerows(rows)
|
|
|
|
|
def print_text_and_audio_length(
    text_file='../test_data/recordings/text/test_asr_zh_with_index.txt',
    audio_folder='../test_data/recordings',
    output_csv='csv/text_audio_length.csv',
):
    """Export index, text and audio duration of every numbered line to a CSV.

    For each ``(idx, text)`` pair produced by ``get_lines_with_index`` the
    duration of ``<audio_folder>/<idx>.wav`` is looked up with
    ``get_wav_length`` and rounded to two decimals. All rows are then written
    with ``write_csv``.

    Args:
        text_file: Numbered text file to read.
        audio_folder: Folder containing one ``<idx>.wav`` per numbered line.
        output_csv: Path of the CSV file to write.

    The defaults are the paths the script has always used, so calling the
    function without arguments behaves as before.
    """
    rows = []
    for idx, text in get_lines_with_index(text_file):
        audio_path = os.path.join(audio_folder, f"{idx}.wav")
        audio_length = get_wav_length(audio_path)
        # Round exactly once. The previous code guarded against None and then
        # called round() again unconditionally, which would raise TypeError
        # for the very value the guard was meant to tolerate.
        if audio_length is not None:
            audio_length = round(audio_length, 2)
        rows.append([idx, text, audio_length])
    write_csv(rows, output_csv)
|
|
|
|
|
def get_text_distance(text1, text2):
    """Compare two texts after cleaning them with the project's helper.

    Both inputs are passed through ``clean_text_for_comparison_zh``; the
    cleaned strings are then given to ``run_textdistance`` and
    ``highlight_diff`` (called with ``spliter=""``).

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        A 3-tuple: the two values returned by ``run_textdistance`` followed by
        the result of ``highlight_diff``.
        NOTE(review): the two values are presumably a distance and its
        normalized form — confirm against ``lib.utils``.
    """
    # Imported lazily, as in the original, so the module loads without lib.utils.
    from lib.utils import run_textdistance, clean_text_for_comparison_zh, highlight_diff

    cleaned_first, cleaned_second = (
        clean_text_for_comparison_zh(raw) for raw in (text1, text2)
    )
    dist, norm_dist = run_textdistance(cleaned_first, cleaned_second)
    marked = highlight_diff(cleaned_first, cleaned_second, spliter="")
    return dist, norm_dist, marked
|
|
|
|
|
def get_origin_text_dict():
    """Load the numbered reference texts into a dictionary.

    Reads ``../test_data/recordings/text/test_asr_zh_with_index.txt`` through
    ``get_lines_with_index``.

    Returns:
        A dict mapping each line number (as yielded by
        ``get_lines_with_index``) to its text. If a number occurs more than
        once, the last occurrence wins.
    """
    source = '../test_data/recordings/text/test_asr_zh_with_index.txt'
    # dict() consumes the (number, text) pairs directly.
    return dict(get_lines_with_index(source))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: when the file is executed directly, build the
# text/audio-length CSV using the default paths.
if __name__ == '__main__':
    print_text_and_audio_length()
|
|
|