|
|
|
|
|
|
|
|
|
|
|
|
|
""" |
|
Helper script to pre-compute embeddings for a flashlight (previously called wav2letter++) dataset |
|
""" |
|
|
|
import argparse |
|
import os |
|
|
|
|
|
def _load_transcripts(path):
    """Parse one ``*.trans.txt`` file into an ``{utterance_id: text}`` dict.

    Every non-blank line is split on whitespace: the first token is the
    utterance id and the remaining tokens, re-joined with single spaces,
    are its transcription.
    """
    texts = {}
    with open(path, "r", encoding="utf-8") as trans_f:
        for tline in trans_f:
            items = tline.split()
            if not items:
                # Tolerate blank lines instead of failing on items[0].
                continue
            texts[items[0]] = " ".join(items[1:])
    return texts


def main():
    """Write word and letter label files for the entries of a tsv manifest.

    Command-line arguments:
        tsv: manifest file. Its first line is the dataset root directory;
            every following non-blank line starts with a file path relative
            to that root. Only the directory part and the base name up to
            its first "." are used.
        --output-dir: directory for the outputs (created if missing).
        --output-name: base name of the outputs; ``<name>.wrd`` and
            ``<name>.ltr`` are written inside ``--output-dir``.

    For each entry the transcription is looked up in
    ``<root>/<dir>/<parent>-<leaf>.trans.txt``, where ``<parent>`` and
    ``<leaf>`` are the last two components of the entry's directory. The
    ``.wrd`` file receives the transcription as is; the ``.ltr`` file
    receives its characters separated by spaces, with "|" replacing each
    space and a trailing " |" appended.

    Raises:
        ValueError: the manifest is empty, or an entry's directory has
            fewer than two components.
        FileNotFoundError: the expected transcript file does not exist.
        KeyError: an entry has no transcription in its transcript file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("tsv")
    parser.add_argument("--output-dir", required=True)
    parser.add_argument("--output-name", required=True)
    args = parser.parse_args()

    os.makedirs(args.output_dir, exist_ok=True)

    # Cache of parsed transcript files, keyed by the entry's relative directory.
    transcriptions = {}

    with open(args.tsv, "r", encoding="utf-8") as tsv, open(
        os.path.join(args.output_dir, args.output_name + ".ltr"),
        "w",
        encoding="utf-8",
    ) as ltr_out, open(
        os.path.join(args.output_dir, args.output_name + ".wrd"),
        "w",
        encoding="utf-8",
    ) as wrd_out:
        try:
            root = next(tsv).strip()
        except StopIteration:
            raise ValueError(f"manifest {args.tsv!r} is empty") from None

        for line in tsv:
            line = line.strip()
            if not line:
                # Skip blank lines (e.g. a trailing newline at end of file).
                continue

            rel_dir = os.path.dirname(line)
            if rel_dir not in transcriptions:
                parts = rel_dir.split(os.path.sep)
                if len(parts) < 2:
                    raise ValueError(
                        "expected at least two directory levels in manifest "
                        f"entry {line!r}"
                    )
                trans_path = f"{parts[-2]}-{parts[-1]}.trans.txt"
                path = os.path.join(root, rel_dir, trans_path)
                # Explicit check instead of assert: asserts vanish under -O.
                if not os.path.exists(path):
                    raise FileNotFoundError(f"transcript file not found: {path}")
                transcriptions[rel_dir] = _load_transcripts(path)

            part = os.path.basename(line).split(".")[0]
            texts = transcriptions[rel_dir]
            if part not in texts:
                raise KeyError(
                    f"no transcription for {part!r} (manifest entry {line!r})"
                )

            text = texts[part]
            print(text, file=wrd_out)
            print(" ".join(text.replace(" ", "|")) + " |", file=ltr_out)
|
|
|
|
|
# Run the command-line entry point only when this file is executed as a
# script, so importing the module has no side effects.
if __name__ == "__main__":
    main()
|
|