File size: 2,213 Bytes
37254e5
2e00f3e
 
 
fc09a10
2e00f3e
 
 
 
 
 
37254e5
75cb74f
3ece66a
75cb74f
3ece66a
75cb74f
 
2e00f3e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3ece66a
edf267e
2e00f3e
 
 
38e621e
edf267e
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import gradio as gr

import argparse
import soundfile as sf
import numpy as np
import tempfile
from pathlib import Path
import os
import subprocess
import sys
import re

# from transformers import AutoProcessor, AutoModelForPreTraining

# processor = AutoProcessor.from_pretrained("patrickvonplaten/mms-1b")

# model = AutoModelForPreTraining.from_pretrained("patrickvonplaten/mms-1b")

def process(audio, model, lang, format):
    """Transcribe audio files by running ``infer.py`` on a temporary manifest.

    A throw-away directory is filled with the ``dev.tsv``, ``dev.uid``,
    ``dev.ltr`` and ``dev.wrd`` manifest files, ``infer.py`` is run as a
    subprocess with that directory as both ``task.data`` and
    ``decoding.results_path``, and the ``hypo.word`` file it leaves there is
    read back. Each hypothesis is also printed to stdout next to its input.

    Args:
        audio: Iterable of audio file paths.
        model: Checkpoint path, passed as ``common_eval.path``.
        lang: Language code, passed as ``dataset.gen_subset=<lang>:dev``.
        format: Value passed as ``common_eval.post_process``.

    Returns:
        A list with one cleaned hypothesis string per line of ``hypo.word``,
        in file order. An empty list when ``audio`` is empty.

    Raises:
        subprocess.CalledProcessError: If ``infer.py`` exits with a non-zero
            status.
    """
    # Materialise once: the input may be a numpy array or a one-shot iterable,
    # and the number of entries is needed for every manifest file.
    audio_paths = [str(path) for path in audio]
    if not audio_paths:
        return []
    count = len(audio_paths)

    hypotheses = []
    with tempfile.TemporaryDirectory() as tmpdir:
        print(">>> preparing tmp manifest dir ...", file=sys.stderr)
        tmpdir = Path(tmpdir)
        with open(tmpdir / "dev.tsv", "w") as fw:
            fw.write("/\n")
            for path in audio_paths:
                # Close the sound file right after reading its frame count.
                with sf.SoundFile(path) as sound:
                    nsample = sound.frames
                fw.write(f"{path}\t{nsample}\n")
        with open(tmpdir / "dev.uid", "w") as fw:
            fw.writelines(f"{path}\n" for path in audio_paths)
        # Placeholder transcripts, one line per manifest entry.
        with open(tmpdir / "dev.ltr", "w") as fw:
            fw.write("d u m m y | d u m m y\n" * count)
        with open(tmpdir / "dev.wrd", "w") as fw:
            fw.write("dummy dummy\n" * count)

        # An argv list (no shell) keeps paths containing spaces or quotes
        # intact; the variables the shell line used to set go through `env`.
        cmd = [
            "python",
            "infer.py",
            "-m",
            "decoding.type=viterbi",
            "dataset.max_tokens=4000000",
            "distributed_training.distributed_world_size=1",
            f"common_eval.path='{model}'",
            f"task.data={tmpdir}",
            f"dataset.gen_subset={lang}:dev",
            f"common_eval.post_process={format}",
            f"decoding.results_path={tmpdir}",
        ]
        env = dict(
            os.environ,
            PYTHONPATH=".",
            PREFIX="INFER",
            HYDRA_FULL_ERROR="1",
        )
        print(">>> loading model & running inference ...", file=sys.stderr)
        # check=True reports a failed run directly instead of as a missing
        # hypo.word file further down.
        subprocess.run(cmd, env=env, stdout=subprocess.DEVNULL, check=True)

        with open(tmpdir / "hypo.word") as fr:
            for ii, hypo in enumerate(fr):
                # Drop the trailing "(...)" tag appended to each line.
                hypo = re.sub(r"\(\S+\)$", "", hypo).strip()
                print(f'===============\nInput: {audio_paths[ii]}\nOutput: {hypo}')
                hypotheses.append(hypo)
    return hypotheses

def transcribe(audio):
    """Gradio callback: transcribe the recorded audio and return the text.

    Args:
        audio: File path (or collection of file paths) of the recording, or
            ``None`` when there is nothing to transcribe.

    Returns:
        The hypotheses reported by ``process`` joined with newlines, or an
        empty string when ``audio`` is ``None`` or ``process`` reports nothing.
    """
    if audio is None:
        return ""

    model = "base_300m.pt"
    lang = "eng"
    post_process = "letter"
    # np.ravel turns a single path into a one-element array so that
    # `process` always receives an iterable of paths.
    hypotheses = process(np.ravel(audio), model, lang, post_process)
    if not hypotheses:
        return ""
    return "\n".join(hypotheses)

# Build the web UI: microphone recordings are handed to `transcribe` as file
# paths and its result is rendered in a textbox; `live=True` is passed so the
# interface re-runs without an explicit submit.
demo = gr.Interface(
    fn=transcribe,
    title='MetaAI (Facebook Research) MMS (Massively Multilingual Speech) ASR',
    inputs=[gr.inputs.Audio(source="microphone", type="filepath")],
    outputs=["textbox"],
    live=True,
)
demo.launch()