radames (HF staff) committed
Commit 4a67689
1 Parent(s): 3e8007d
app.py DELETED
@@ -1,215 +0,0 @@
-from __future__ import annotations
-
-import gradio as gr
-import numpy as np
-
-import asyncio
-from simuleval_transcoder import SimulevalTranscoder, logger
-
-import time
-from simuleval.utils.agent import build_system_from_dir
-import torch
-
-
-language_code_to_name = {
-    "cmn": "Mandarin Chinese",
-    "deu": "German",
-    "eng": "English",
-    "fra": "French",
-    "spa": "Spanish",
-}
-S2ST_TARGET_LANGUAGE_NAMES = language_code_to_name.values()
-LANGUAGE_NAME_TO_CODE = {v: k for k, v in language_code_to_name.items()}
-
-DEFAULT_TARGET_LANGUAGE = "English"
-
-
-def build_agent(model_path, config_name=None):
-    agent = build_system_from_dir(
-        model_path, config_name=config_name,
-    )
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    agent.to(device, fp16=True)
-
-    return agent
-
-agent = build_agent("models", "vad_s2st_sc_24khz_main.yaml")
-transcoder = SimulevalTranscoder(
-    agent,
-    sample_rate=48_000,
-    debug=False,
-    buffer_limit=1,
-)
-
-def start_recording():
-    logger.debug(f"start_recording: starting transcoder")
-    transcoder.reset_states()
-    transcoder.close = False
-    transcoder.start()
-
-def stop_recording():
-    transcoder.close = True
-
-class MyState:
-    def __init__(self):
-        self.queue = asyncio.Queue()
-        self.close = False
-
-
-s = MyState()
-
-def process_incoming_bytes(audio):
-    logger.debug(f"process_bytes: incoming audio")
-    sample_rate, data = audio
-    transcoder.process_incoming_bytes(data.tobytes(), 'eng', sample_rate)
-    s.queue.put_nowait(audio)
-
-
-
-def get_buffered_output():
-
-    speech_and_text_output = transcoder.get_buffered_output()
-    if speech_and_text_output is None:
-        logger.debug("No output from transcoder.get_buffered_output()")
-        return None, None, None
-
-    logger.debug(f"We DID get output from the transcoder!")
-
-    text = None
-    speech = None
-
-    if speech_and_text_output.speech_samples:
-        speech = (speech_and_text_output.speech_sample_rate, speech_and_text_output.speech_samples)
-
-    if speech_and_text_output.text:
-        text = speech_and_text_output.text
-        if speech_and_text_output.final:
-            text += "\n"
-
-    return speech, text, speech_and_text_output.final
-
-from scipy.io.wavfile import write as scipy_write
-def streaming_input_callback():
-    final = False
-    max_wait_s = 15
-    wait_s = 0
-    translated_text_state = ""
-    sample_rate = 24000
-    while not transcoder.close:
-        translated_wav_segment, translated_text, final = get_buffered_output()
-
-        if translated_wav_segment is None and translated_text is None:
-            time.sleep(0.3)
-            wait_s += 0.3
-            if wait_s >= max_wait_s:
-                transcoder.close = True
-            continue
-        wait_s = 0
-        if translated_wav_segment is not None:
-            sample_rate, audio_bytes = translated_wav_segment
-            print("output sample rate", sample_rate)
-            translated_wav_segment = sample_rate, np.array(audio_bytes)
-        else:
-            translated_wav_segment = sample_rate, np.empty(0, dtype=np.int16)
-
-        if translated_text is not None:
-            translated_text_state += " | " + str(translated_text)
-
-        stream_output_text = translated_text_state
-        if translated_text is not None:
-            print("translated:", translated_text_state)
-        yield [
-            translated_wav_segment,
-            stream_output_text,
-            translated_text_state,
-        ]
-
-
-def streaming_callback_dummy():
-    i = 0
-    out_text = ""
-    while not transcoder.close:
-        if s.queue.empty():
-            yield (
-                (48000, np.empty(0, dtype=np.int16)), out_text, out_text
-            )
-            time.sleep(0.3)
-        else:
-            i += 1
-            out_text += " | " + str(i)
-            print(out_text)
-            audio = s.queue.get_nowait()
-            if i == 0:
-                print(audio[0], type(audio[1]))
-            s.queue.task_done()
-            yield audio, out_text, out_text
-
-def clear():
-    logger.debug(f"Clearing State")
-    return [bytes(), ""]
-
-
-def blocks():
-    with gr.Blocks() as demo:
-
-        with gr.Row():
-            # TODO: add target language switching
-            target_language = gr.Dropdown(
-                label="Target language",
-                choices=S2ST_TARGET_LANGUAGE_NAMES,
-                value=DEFAULT_TARGET_LANGUAGE,
-            )
-
-        translated_text_state = gr.State("")
-
-        input_audio = gr.Audio(
-            label="Input Audio",
-            sources=["microphone"],
-            streaming=True,
-        )
-
-        output_translation_segment = gr.Audio(
-            label="Translated audio segment",
-            autoplay=True,
-            streaming=True,
-        )
-
-        # Output text segment
-        stream_output_text = gr.Textbox(label="Translated text")
-
-        input_audio.clear(
-            clear, None, [output_translation_segment, translated_text_state]
-        )
-        input_audio.start_recording(
-            clear, None, [output_translation_segment, translated_text_state]
-        ).then(
-            start_recording
-        ).then(
-            # TODO: streaming speech autoplay works fine with streaming_callback_dummy,
-            # but speech output from streaming_input_callback has a huge delay
-            # when comparing print/debugging logs vs. output speech
-            # TODO: text output works fine with one output, but is not
-            # updating when output is both text + speech
-            # streaming_callback_dummy,
-            streaming_input_callback,
-            None,
-            [
-                output_translation_segment,
-                stream_output_text,
-                translated_text_state,
-            ]
-        )
-        input_audio.stop_recording(
-            stop_recording
-        )
-        input_audio.stream(
-            # TODO: *only when streaming speech output* about half the time
-            # there is some race condition in gradio where process_incoming_bytes
-            # stops getting called once the first speech chunk is yield-ed
-            # in streaming_input_callback (or streaming_callback_dummy)
-            process_incoming_bytes, [input_audio], None
-        )
-
-    demo.launch(server_port=6010)
-
-blocks()

assets/sample_input.mp3 DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:982369687f05bf8fcd6923c4ffcccda0fcce92f44eceae5a9d00a431f07ea87b
-size 10272

assets/sample_input_2.mp3 DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:6a505a4641e3f5f0ddec9508832793aa20e63d2545530b66bc04a9bd19a742e6
-size 30624

lang_list.py DELETED
@@ -1,254 +0,0 @@
-# Language dict
-language_code_to_name = {
-    "afr": "Afrikaans",
-    "amh": "Amharic",
-    "arb": "Modern Standard Arabic",
-    "ary": "Moroccan Arabic",
-    "arz": "Egyptian Arabic",
-    "asm": "Assamese",
-    "ast": "Asturian",
-    "azj": "North Azerbaijani",
-    "bel": "Belarusian",
-    "ben": "Bengali",
-    "bos": "Bosnian",
-    "bul": "Bulgarian",
-    "cat": "Catalan",
-    "ceb": "Cebuano",
-    "ces": "Czech",
-    "ckb": "Central Kurdish",
-    "cmn": "Mandarin Chinese",
-    "cym": "Welsh",
-    "dan": "Danish",
-    "deu": "German",
-    "ell": "Greek",
-    "eng": "English",
-    "est": "Estonian",
-    "eus": "Basque",
-    "fin": "Finnish",
-    "fra": "French",
-    "gaz": "West Central Oromo",
-    "gle": "Irish",
-    "glg": "Galician",
-    "guj": "Gujarati",
-    "heb": "Hebrew",
-    "hin": "Hindi",
-    "hrv": "Croatian",
-    "hun": "Hungarian",
-    "hye": "Armenian",
-    "ibo": "Igbo",
-    "ind": "Indonesian",
-    "isl": "Icelandic",
-    "ita": "Italian",
-    "jav": "Javanese",
-    "jpn": "Japanese",
-    "kam": "Kamba",
-    "kan": "Kannada",
-    "kat": "Georgian",
-    "kaz": "Kazakh",
-    "kea": "Kabuverdianu",
-    "khk": "Halh Mongolian",
-    "khm": "Khmer",
-    "kir": "Kyrgyz",
-    "kor": "Korean",
-    "lao": "Lao",
-    "lit": "Lithuanian",
-    "ltz": "Luxembourgish",
-    "lug": "Ganda",
-    "luo": "Luo",
-    "lvs": "Standard Latvian",
-    "mai": "Maithili",
-    "mal": "Malayalam",
-    "mar": "Marathi",
-    "mkd": "Macedonian",
-    "mlt": "Maltese",
-    "mni": "Meitei",
-    "mya": "Burmese",
-    "nld": "Dutch",
-    "nno": "Norwegian Nynorsk",
-    "nob": "Norwegian Bokm\u00e5l",
-    "npi": "Nepali",
-    "nya": "Nyanja",
-    "oci": "Occitan",
-    "ory": "Odia",
-    "pan": "Punjabi",
-    "pbt": "Southern Pashto",
-    "pes": "Western Persian",
-    "pol": "Polish",
-    "por": "Portuguese",
-    "ron": "Romanian",
-    "rus": "Russian",
-    "slk": "Slovak",
-    "slv": "Slovenian",
-    "sna": "Shona",
-    "snd": "Sindhi",
-    "som": "Somali",
-    "spa": "Spanish",
-    "srp": "Serbian",
-    "swe": "Swedish",
-    "swh": "Swahili",
-    "tam": "Tamil",
-    "tel": "Telugu",
-    "tgk": "Tajik",
-    "tgl": "Tagalog",
-    "tha": "Thai",
-    "tur": "Turkish",
-    "ukr": "Ukrainian",
-    "urd": "Urdu",
-    "uzn": "Northern Uzbek",
-    "vie": "Vietnamese",
-    "xho": "Xhosa",
-    "yor": "Yoruba",
-    "yue": "Cantonese",
-    "zlm": "Colloquial Malay",
-    "zsm": "Standard Malay",
-    "zul": "Zulu",
-}
-LANGUAGE_NAME_TO_CODE = {v: k for k, v in language_code_to_name.items()}
-
-# Source langs: S2ST / S2TT / ASR don't need source lang
-# T2TT / T2ST use this
-text_source_language_codes = [
-    "afr",
-    "amh",
-    "arb",
-    "ary",
-    "arz",
-    "asm",
-    "azj",
-    "bel",
-    "ben",
-    "bos",
-    "bul",
-    "cat",
-    "ceb",
-    "ces",
-    "ckb",
-    "cmn",
-    "cym",
-    "dan",
-    "deu",
-    "ell",
-    "eng",
-    "est",
-    "eus",
-    "fin",
-    "fra",
-    "gaz",
-    "gle",
-    "glg",
-    "guj",
-    "heb",
-    "hin",
-    "hrv",
-    "hun",
-    "hye",
-    "ibo",
-    "ind",
-    "isl",
-    "ita",
-    "jav",
-    "jpn",
-    "kan",
-    "kat",
-    "kaz",
-    "khk",
-    "khm",
-    "kir",
-    "kor",
-    "lao",
-    "lit",
-    "lug",
-    "luo",
-    "lvs",
-    "mai",
-    "mal",
-    "mar",
-    "mkd",
-    "mlt",
-    "mni",
-    "mya",
-    "nld",
-    "nno",
-    "nob",
-    "npi",
-    "nya",
-    "ory",
-    "pan",
-    "pbt",
-    "pes",
-    "pol",
-    "por",
-    "ron",
-    "rus",
-    "slk",
-    "slv",
-    "sna",
-    "snd",
-    "som",
-    "spa",
-    "srp",
-    "swe",
-    "swh",
-    "tam",
-    "tel",
-    "tgk",
-    "tgl",
-    "tha",
-    "tur",
-    "ukr",
-    "urd",
-    "uzn",
-    "vie",
-    "yor",
-    "yue",
-    "zsm",
-    "zul",
-]
-TEXT_SOURCE_LANGUAGE_NAMES = sorted([language_code_to_name[code] for code in text_source_language_codes])
-
-# Target langs:
-# S2ST / T2ST
-s2st_target_language_codes = [
-    "eng",
-    "arb",
-    "ben",
-    "cat",
-    "ces",
-    "cmn",
-    "cym",
-    "dan",
-    "deu",
-    "est",
-    "fin",
-    "fra",
-    "hin",
-    "ind",
-    "ita",
-    "jpn",
-    "kor",
-    "mlt",
-    "nld",
-    "pes",
-    "pol",
-    "por",
-    "ron",
-    "rus",
-    "slk",
-    "spa",
-    "swe",
-    "swh",
-    "tel",
-    "tgl",
-    "tha",
-    "tur",
-    "ukr",
-    "urd",
-    "uzn",
-    "vie",
-]
-S2ST_TARGET_LANGUAGE_NAMES = sorted([language_code_to_name[code] for code in s2st_target_language_codes])
-
-# S2TT / ASR
-S2TT_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES
-# T2TT
-T2TT_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES

m4t_app.py DELETED
@@ -1,463 +0,0 @@
-from __future__ import annotations
-
-import os
-
-import gradio as gr
-import numpy as np
-import torch
-import torchaudio
-from seamless_communication.models.inference.translator import Translator
-
-from lang_list import (
-    LANGUAGE_NAME_TO_CODE,
-    S2ST_TARGET_LANGUAGE_NAMES,
-    S2TT_TARGET_LANGUAGE_NAMES,
-    T2TT_TARGET_LANGUAGE_NAMES,
-    TEXT_SOURCE_LANGUAGE_NAMES,
-)
-
-DESCRIPTION = """# SeamlessM4T
-
-# mduppes aaaaaa
-
-[SeamlessM4T](https://github.com/facebookresearch/seamless_communication) is designed to provide high-quality
-translation, allowing people from different linguistic communities to communicate effortlessly through speech and text.
-
-This unified model enables multiple tasks like Speech-to-Speech (S2ST), Speech-to-Text (S2TT), Text-to-Speech (T2ST)
-translation and more, without relying on multiple separate models.
-"""
-
-CACHE_EXAMPLES = os.getenv("CACHE_EXAMPLES") == "1"
-
-TASK_NAMES = [
-    "S2ST (Speech to Speech translation)",
-    "S2TT (Speech to Text translation)",
-    "T2ST (Text to Speech translation)",
-    "T2TT (Text to Text translation)",
-    "ASR (Automatic Speech Recognition)",
-]
-AUDIO_SAMPLE_RATE = 16000.0
-MAX_INPUT_AUDIO_LENGTH = 60  # in seconds
-DEFAULT_TARGET_LANGUAGE = "French"
-
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-print("DEVICE", device)
-translator = Translator(
-    model_name_or_card="seamlessM4T_medium",
-    vocoder_name_or_card="vocoder_36langs",
-    device=device,
-    # dtype=torch.float16,
-    # For CPU Mode need to use 32, float16 causes errors downstream
-    dtype=torch.float32,
-)
-
-def get_translator():
-    return translator
-
-
-def transcribe(audio):
-    print(audio)
-    text = p(audio)["text"]
-    return text
-
-def transcribe_state(audio, state = ""):
-    print(audio)
-    text = p(audio)["text"]
-    state += text + " "
-    return state, state
-
-
-def predict(
-    task_name: str,
-    audio_source: str,
-    input_audio_mic: str | None,
-    input_audio_file: str | None,
-    input_text: str | None,
-    source_language: str | None,
-    target_language: str,
-) -> tuple[tuple[int, np.ndarray] | None, str]:
-    task_name = task_name.split()[0]
-    source_language_code = LANGUAGE_NAME_TO_CODE[source_language] if source_language else None
-    target_language_code = LANGUAGE_NAME_TO_CODE[target_language]
-
-    if task_name in ["S2ST", "S2TT", "ASR"]:
-        if audio_source == "microphone":
-            input_data = input_audio_mic
-        else:
-            input_data = input_audio_file
-
-        arr, org_sr = torchaudio.load(input_data)
-        print(task_name, audio_source, input_audio_mic, type(input_audio_file), type(input_text), source_language, target_language)
-        new_arr = torchaudio.functional.resample(arr, orig_freq=org_sr, new_freq=AUDIO_SAMPLE_RATE)
-        max_length = int(MAX_INPUT_AUDIO_LENGTH * AUDIO_SAMPLE_RATE)
-        if new_arr.shape[1] > max_length:
-            new_arr = new_arr[:, :max_length]
-            gr.Warning(f"Input audio is too long. Only the first {MAX_INPUT_AUDIO_LENGTH} seconds is used.")
-        torchaudio.save(input_data, new_arr, sample_rate=int(AUDIO_SAMPLE_RATE))
-    else:
-        input_data = input_text
-    text_out, wav, sr = translator.predict(
-        input=input_data,
-        task_str=task_name,
-        tgt_lang=target_language_code,
-        src_lang=source_language_code,
-        ngram_filtering=True,
-        sample_rate=AUDIO_SAMPLE_RATE,
-    )
-    print("translation response", text_out, wav, sr)
-    # text_out = "Testing"
-    # return None, text_out
-    if task_name in ["S2ST", "T2ST"]:
-        return (sr, wav.cpu().detach().numpy()), text_out
-    else:
-        return None, text_out
-
-
-def process_s2st_example(input_audio_file: str, target_language: str) -> tuple[tuple[int, np.ndarray] | None, str]:
-    return predict(
-        task_name="S2ST",
-        audio_source="file",
-        input_audio_mic=None,
-        input_audio_file=input_audio_file,
-        input_text=None,
-        source_language=None,
-        target_language=target_language,
-    )
-
-
-def process_s2tt_example(input_audio_file: str, target_language: str) -> tuple[tuple[int, np.ndarray] | None, str]:
-    return predict(
-        task_name="S2TT",
-        audio_source="file",
-        input_audio_mic=None,
-        input_audio_file=input_audio_file,
-        input_text=None,
-        source_language=None,
-        target_language=target_language,
-    )
-
-
-def process_t2st_example(
-    input_text: str, source_language: str, target_language: str
-) -> tuple[tuple[int, np.ndarray] | None, str]:
-    return predict(
-        task_name="T2ST",
-        audio_source="",
-        input_audio_mic=None,
-        input_audio_file=None,
-        input_text=input_text,
-        source_language=source_language,
-        target_language=target_language,
-    )
-
-
-def process_t2tt_example(
-    input_text: str, source_language: str, target_language: str
-) -> tuple[tuple[int, np.ndarray] | None, str]:
-    return predict(
-        task_name="T2TT",
-        audio_source="",
-        input_audio_mic=None,
-        input_audio_file=None,
-        input_text=input_text,
-        source_language=source_language,
-        target_language=target_language,
-    )
-
-
-def process_asr_example(input_audio_file: str, target_language: str) -> tuple[tuple[int, np.ndarray] | None, str]:
-    return predict(
-        task_name="ASR",
-        audio_source="file",
-        input_audio_mic=None,
-        input_audio_file=input_audio_file,
-        input_text=None,
-        source_language=None,
-        target_language=target_language,
-    )
-
-
-def update_audio_ui(audio_source: str) -> tuple[dict, dict]:
-    mic = audio_source == "microphone"
-    return (
-        gr.update(visible=mic, value=None),  # input_audio_mic
-        gr.update(visible=not mic, value=None),  # input_audio_file
-    )
-
-
-def update_input_ui(task_name: str) -> tuple[dict, dict, dict, dict]:
-    task_name = task_name.split()[0]
-    if task_name == "S2ST":
-        return (
-            gr.update(visible=True),  # audio_box
-            gr.update(visible=False),  # input_text
-            gr.update(visible=False),  # source_language
-            gr.update(
-                visible=True, choices=S2ST_TARGET_LANGUAGE_NAMES, value=DEFAULT_TARGET_LANGUAGE
-            ),  # target_language
-        )
-    elif task_name == "S2TT":
-        return (
-            gr.update(visible=True),  # audio_box
-            gr.update(visible=False),  # input_text
-            gr.update(visible=False),  # source_language
-            gr.update(
-                visible=True, choices=S2TT_TARGET_LANGUAGE_NAMES, value=DEFAULT_TARGET_LANGUAGE
-            ),  # target_language
-        )
-    elif task_name == "T2ST":
-        return (
-            gr.update(visible=False),  # audio_box
-            gr.update(visible=True),  # input_text
-            gr.update(visible=True),  # source_language
-            gr.update(
-                visible=True, choices=S2ST_TARGET_LANGUAGE_NAMES, value=DEFAULT_TARGET_LANGUAGE
-            ),  # target_language
-        )
-    elif task_name == "T2TT":
-        return (
-            gr.update(visible=False),  # audio_box
-            gr.update(visible=True),  # input_text
-            gr.update(visible=True),  # source_language
-            gr.update(
-                visible=True, choices=T2TT_TARGET_LANGUAGE_NAMES, value=DEFAULT_TARGET_LANGUAGE
-            ),  # target_language
-        )
-    elif task_name == "ASR":
-        return (
-            gr.update(visible=True),  # audio_box
-            gr.update(visible=False),  # input_text
-            gr.update(visible=False),  # source_language
-            gr.update(
-                visible=True, choices=S2TT_TARGET_LANGUAGE_NAMES, value=DEFAULT_TARGET_LANGUAGE
-            ),  # target_language
-        )
-    else:
-        raise ValueError(f"Unknown task: {task_name}")
-
-
-def update_output_ui(task_name: str) -> tuple[dict, dict]:
-    task_name = task_name.split()[0]
-    if task_name in ["S2ST", "T2ST"]:
-        return (
-            gr.update(visible=True, value=None),  # output_audio
-            gr.update(value=None),  # output_text
-        )
-    elif task_name in ["S2TT", "T2TT", "ASR"]:
-        return (
-            gr.update(visible=False, value=None),  # output_audio
-            gr.update(value=None),  # output_text
-        )
-    else:
-        raise ValueError(f"Unknown task: {task_name}")
-
-
-def update_example_ui(task_name: str) -> tuple[dict, dict, dict, dict, dict]:
-    task_name = task_name.split()[0]
-    return (
-        gr.update(visible=task_name == "S2ST"),  # s2st_example_row
-        gr.update(visible=task_name == "S2TT"),  # s2tt_example_row
-        gr.update(visible=task_name == "T2ST"),  # t2st_example_row
-        gr.update(visible=task_name == "T2TT"),  # t2tt_example_row
-        gr.update(visible=task_name == "ASR"),  # asr_example_row
-    )
-
-def m4t_demo():
-
-    with gr.Blocks(css="style.css") as demo:
-        gr.Markdown(DESCRIPTION)
-        gr.DuplicateButton(
-            value="Duplicate Space for private use",
-            elem_id="duplicate-button",
-            visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
-        )
-
-        with gr.Group():
-            task_name = gr.Dropdown(
-                label="Task",
-                choices=TASK_NAMES,
-                value=TASK_NAMES[0],
-            )
-
-
-            with gr.Row():
-                source_language = gr.Dropdown(
-                    label="Source language",
-                    choices=TEXT_SOURCE_LANGUAGE_NAMES,
-                    value="English",
-                    visible=False,
-                )
-                target_language = gr.Dropdown(
-                    label="Target language",
-                    choices=S2ST_TARGET_LANGUAGE_NAMES,
-                    value=DEFAULT_TARGET_LANGUAGE,
-                )
-            with gr.Row() as audio_box:
-                audio_source = gr.Radio(
-                    label="Audio source",
-                    choices=["file", "microphone"],
-                    value="file",
-                )
-                input_audio_mic = gr.Audio(
-                    label="Input speech",
-                    type="filepath",
-                    source="microphone",
-                    visible=False,
-                )
-                input_audio_file = gr.Audio(
-                    label="Input speech",
-                    type="filepath",
-                    source="upload",
-                    visible=True,
-                )
-            input_text = gr.Textbox(label="Input text", visible=False)
-            btn = gr.Button("Translate")
-        with gr.Column():
-            output_audio = gr.Audio(
-                label="Translated speech",
-                autoplay=False,
-                streaming=False,
-                type="numpy",
-            )
-            output_text = gr.Textbox(label="Translated text")
-
-        with gr.Row(visible=True) as s2st_example_row:
-            s2st_examples = gr.Examples(
-                examples=[
-                    ["assets/sample_input.mp3", "French"],
-                    ["assets/sample_input.mp3", "Mandarin Chinese"],
-                    ["assets/sample_input_2.mp3", "Hindi"],
-                    ["assets/sample_input_2.mp3", "Spanish"],
-                ],
-                inputs=[input_audio_file, target_language],
-                outputs=[output_audio, output_text],
-                fn=process_s2st_example,
-                cache_examples=CACHE_EXAMPLES,
-            )
-        with gr.Row(visible=False) as s2tt_example_row:
-            s2tt_examples = gr.Examples(
-                examples=[
-                    ["assets/sample_input.mp3", "French"],
-                    ["assets/sample_input.mp3", "Mandarin Chinese"],
-                    ["assets/sample_input_2.mp3", "Hindi"],
-                    ["assets/sample_input_2.mp3", "Spanish"],
-                ],
-                inputs=[input_audio_file, target_language],
-                outputs=[output_audio, output_text],
-                fn=process_s2tt_example,
-                cache_examples=CACHE_EXAMPLES,
-            )
-        with gr.Row(visible=False) as t2st_example_row:
-            t2st_examples = gr.Examples(
-                examples=[
-                    ["My favorite animal is the elephant.", "English", "French"],
-                    ["My favorite animal is the elephant.", "English", "Mandarin Chinese"],
-                    [
-                        "Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
-                        "English",
-                        "Hindi",
-                    ],
-                    [
-                        "Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
-                        "English",
-                        "Spanish",
-                    ],
-                ],
-                inputs=[input_text, source_language, target_language],
-                outputs=[output_audio, output_text],
-                fn=process_t2st_example,
-                cache_examples=CACHE_EXAMPLES,
-            )
-        with gr.Row(visible=False) as t2tt_example_row:
-            t2tt_examples = gr.Examples(
-                examples=[
-                    ["My favorite animal is the elephant.", "English", "French"],
-                    ["My favorite animal is the elephant.", "English", "Mandarin Chinese"],
-                    [
-                        "Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
-                        "English",
-                        "Hindi",
-                    ],
-                    [
-                        "Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
-                        "English",
-                        "Spanish",
-                    ],
-                ],
-                inputs=[input_text, source_language, target_language],
-                outputs=[output_audio, output_text],
-                fn=process_t2tt_example,
-                cache_examples=CACHE_EXAMPLES,
-            )
-        with gr.Row(visible=False) as asr_example_row:
-            asr_examples = gr.Examples(
-                examples=[
-                    ["assets/sample_input.mp3", "English"],
-                    ["assets/sample_input_2.mp3", "English"],
-                ],
-                inputs=[input_audio_file, target_language],
-                outputs=[output_audio, output_text],
-                fn=process_asr_example,
-                cache_examples=CACHE_EXAMPLES,
-            )
-
-        audio_source.change(
-            fn=update_audio_ui,
-            inputs=audio_source,
-            outputs=[
-                input_audio_mic,
-                input_audio_file,
-            ],
-            queue=False,
-            api_name=False,
-        )
-        task_name.change(
-            fn=update_input_ui,
-            inputs=task_name,
-            outputs=[
-                audio_box,
-                input_text,
-                source_language,
-                target_language,
-            ],
-            queue=False,
-            api_name=False,
-        ).then(
-            fn=update_output_ui,
-            inputs=task_name,
-            outputs=[output_audio, output_text],
-            queue=False,
-            api_name=False,
-        ).then(
-            fn=update_example_ui,
-            inputs=task_name,
-            outputs=[
-                s2st_example_row,
-                s2tt_example_row,
-                t2st_example_row,
-                t2tt_example_row,
-                asr_example_row,
-            ],
-            queue=False,
-            api_name=False,
-        )
-
-        btn.click(
-            fn=predict,
-            inputs=[
-                task_name,
-                audio_source,
-                input_audio_mic,
-                input_audio_file,
-                input_text,
-                source_language,
-                target_language,
-            ],
-            outputs=[output_audio, output_text],
-            api_name="run",
-        )
-    demo.queue(max_size=50).launch()
-
-# Linking models to the space
-# 'facebook/seamless-m4t-large'
-# 'facebook/SONAR'

models/vad_s2st_sc_24khz_main.yaml DELETED
@@ -1,24 +0,0 @@
-agent_class: seamless_communication.streaming.agents.mma_m4t_s2st.SeamlessS2STJointVADAgent
-# checkpoint: checkpoint_best.pt
-monotonic_decoder_model_name: seamless_streaming_monotonic_decoder
-unity_model_name: seamless_streaming_unity
-sentencepiece_model: spm_256k_nllb100.model
-
-task: s2st
-tgt_lang: "eng"
-min_unit_chunk_size: 50
-decision_threshold: 0.7
-no_early_stop: True
-block_ngrams: True
-vocoder_name: vocoder_pretssel
-wav2vec_yaml: wav2vec.yaml
-# min_starting_wait: 12
-# min_starting_wait_w2vbert: 192
-
-config_yaml: cfg_fbank_u2t.yaml
-vocoder_sample_rate: 24000
-upstream_idx: 1
-detokenize_only: True
-device: cuda:0
-max_len_a: 0
-max_len_b: 1000

requirements.txt DELETED
@@ -1,26 +0,0 @@
-# TODO: fairseq2 install is complicated so currently done outside
-
-# fairseq2==0.1.0
-
-# git+https://github.com/facebookresearch/seamless_communication
-# ./fairseq2
-# ./seamless_communication
-# comment this out to test fairseq1 first
-# git+https://github.com/facebookresearch/SimulEval.git
-gradio==3.41.0
-huggingface_hub==0.16.4
-# torch==2.1.0
-# torchaudio==2.0.2
-# transformers==4.32.1
-pydub
-g2p_en
-colorlog
-# git+ssh://git@github.com/facebookresearch/SimulEval.git
-
-# Can't import fairseq1 together.. causes conflict:
-#The conflict is caused by:
-# The user requested simuleval 1.1.0 (from git+ssh://****@github.com/facebookresearch/SimulEval.git@tree_pipeline)
-# seamless-communication 1.0.0 depends on simuleval 1.0.3.dev36+gd84fa60 (from git+https://github.com/mduppes/SimulEval.git@main)
-# From fairseq1 pipeline
-# git+ssh://git@github.com/fairinternal/fairseq-py.git@emma_incremental_decoder
-# git+ssh://git@github.com/facebookresearch/SimulEval.git@tree_pipeline

sample_wav.py DELETED
The diff for this file is too large to render. See raw diff
 
simuleval_transcoder.py DELETED
@@ -1,425 +0,0 @@
-
-from typing import Any, List, Tuple, Union, Optional
-import numpy as np
-import soundfile
-import io
-import asyncio
-from simuleval.agents.pipeline import TreeAgentPipeline
-from simuleval.agents.states import AgentStates
-from simuleval.data.segments import Segment, EmptySegment, SpeechSegment
-import threading
-import math
-import logging
-import sys
-from pathlib import Path
-import time
-from g2p_en import G2p
-import torch
-import traceback
-import time
-import random
-import colorlog
-
-
-MODEL_SAMPLE_RATE = 16_000
-
-logger = logging.getLogger(__name__)
-logger.propagate = False
-handler = colorlog.StreamHandler(stream=sys.stdout)
-formatter = colorlog.ColoredFormatter(
-    "%(log_color)s[%(asctime)s][%(levelname)s][%(module)s]:%(reset)s %(message)s",
-    reset=True,
-    log_colors={
-        "DEBUG": "cyan",
-        "INFO": "green",
-        "WARNING": "yellow",
-        "ERROR": "red",
-        "CRITICAL": "red,bg_white",
-    },
-)
-handler.setFormatter(formatter)
-logger.addHandler(handler)
-logger.setLevel(logging.DEBUG)
-
-
-class SpeechAndTextOutput:
-    def __init__(
-        self,
-        text: str = None,
-        speech_samples: list = None,
-        speech_sample_rate: float = None,
-        final: bool = False,
-    ):
-        self.text = text
-        self.speech_samples = speech_samples
-        self.speech_sample_rate = speech_sample_rate
-        self.final = final
-
-class OutputSegments:
-    def __init__(self, segments: Union[List[Segment], Segment]):
-        if isinstance(segments, Segment):
-            segments = [segments]
-        self.segments: List[Segment] = [s for s in segments]
-
-    @property
-    def is_empty(self):
-        return all(segment.is_empty for segment in self.segments)
-
-    @property
-    def finished(self):
-        return all(segment.finished for segment in self.segments)
-
-    def compute_length(self, g2p):
-        lengths = []
-        for segment in self.segments:
-            if segment.data_type == "text":
-                lengths.append(len([x for x in g2p(segment.content) if x != " "]))
-            elif segment.data_type == "speech":
-                lengths.append(len(segment.content) / MODEL_SAMPLE_RATE)
-            elif isinstance(segment, EmptySegment):
-                continue
-            else:
-                logger.warning(
-                    f"Unexpected data_type: {segment.data_type} not in 'speech', 'text'"
-                )
-        return max(lengths)
-
-    @classmethod
-    def join_output_buffer(
-        cls, buffer: List[List[Segment]], output: SpeechAndTextOutput
-    ):
-        num_segments = len(buffer[0])
-        for i in range(num_segments):
-            segment_list = [
-                buffer[j][i]
-                for j in range(len(buffer))
-                if buffer[j][i].data_type is not None
-            ]
-            if len(segment_list) == 0:
-                continue
-            if len(set(segment.data_type for segment in segment_list)) != 1:
-                logger.warning(
-                    f"Data type mismatch at {i}: {set(segment.data_type for segment in segment_list)}"
-                )
-                continue
-            data_type = segment_list[0].data_type
-            if data_type == "text":
-                if output.text is not None:
-                    logger.warning("Multiple text outputs, overwriting!")
-                output.text = " ".join([segment.content for segment in segment_list])
-            elif data_type == "speech":
-                if output.speech_samples is not None:
-                    logger.warning("Multiple speech outputs, overwriting!")
-                speech_out = []
-                for segment in segment_list:
-                    speech_out += segment.content
-                output.speech_samples = speech_out
-                output.speech_sample_rate = segment.sample_rate
-            elif isinstance(segment_list[0], EmptySegment):
-                continue
-            else:
-                logger.warning(
-                    f"Invalid output buffer data type: {data_type}, expected 'speech' or 'text"
-                )
-
-        return output
-
-    def __repr__(self) -> str:
-        repr_str = str(self.segments)
-        return f"{self.__class__.__name__}(\n\t{repr_str}\n)"
-
-
-def convert_waveform(
-    waveform: Union[np.ndarray, torch.Tensor],
-    sample_rate: int,
-    normalize_volume: bool = False,
-    to_mono: bool = False,
-    to_sample_rate: Optional[int] = None,
-) -> Tuple[Union[np.ndarray, torch.Tensor], int]:
-    """convert a waveform:
-    - to a target sample rate
-    - from multi-channel to mono channel
-    - volume normalization
-
-    Args:
-        waveform (numpy.ndarray or torch.Tensor): 2D original waveform
-            (channels x length)
-        sample_rate (int): original sample rate
-        normalize_volume (bool): perform volume normalization
-        to_mono (bool): convert to mono channel if having multiple channels
-        to_sample_rate (Optional[int]): target sample rate
-    Returns:
-        waveform (numpy.ndarray): converted 2D waveform (channels x length)
-        sample_rate (float): target sample rate
-    """
-    try:
-        import torchaudio.sox_effects as ta_sox
-    except ImportError:
-        raise ImportError("Please install torchaudio: pip install torchaudio")
-
-    effects = []
-    if normalize_volume:
-        effects.append(["gain", "-n"])
-    if to_sample_rate is not None and to_sample_rate != sample_rate:
-        effects.append(["rate", f"{to_sample_rate}"])
-    if to_mono and waveform.shape[0] > 1:
-        effects.append(["channels", "1"])
-    if len(effects) > 0:
-        is_np_input = isinstance(waveform, np.ndarray)
-        _waveform = torch.from_numpy(waveform) if is_np_input else waveform
-        converted, converted_sample_rate = ta_sox.apply_effects_tensor(
-            _waveform, sample_rate, effects
-        )
-        if is_np_input:
-            converted = converted.numpy()
-        return converted, converted_sample_rate
-    return waveform, sample_rate
-
-class SimulevalTranscoder:
-    def __init__(self, agent, sample_rate, debug, buffer_limit):
-        # agent is stateless
-        self.agent = agent
-        self.input_queue = asyncio.Queue()
-        self.output_queue = asyncio.Queue()
-        self.states = self.agent.build_states()
-        if debug:
-            self.get_states_root().debug = True
-        self.incoming_sample_rate = sample_rate
-        self.close = False
-        self.g2p = G2p()
-
-        # buffer all outgoing translations within this amount of time
-        self.output_buffer_idle_ms = 5000
-        self.output_buffer_size_limit = (
-            buffer_limit  # phonemes for text, seconds for speech
-        )
-        self.output_buffer_cur_size = 0
-        self.output_buffer: List[List[Segment]] = []
-        self.speech_output_sample_rate = None
-
-        self.last_output_ts = time.time() * 1000
-        self.timeout_ms = (
-            30000  # close the transcoder thread after this amount of silence
-        )
-        self.first_input_ts = None
-        self.first_output_ts = None
-        self.debug = debug
-        self.debug_ts = f"{time.time()}_{random.randint(1000, 9999)}"
-        if self.debug:
-            debug_folder = Path(__file__).resolve().parent.parent / "debug"
-            self.test_incoming_wav = soundfile.SoundFile(
-                debug_folder / f"{self.debug_ts}_test_incoming.wav",
-                mode="w+",
-                format="WAV",
-                subtype="PCM_16",
-                samplerate=self.incoming_sample_rate,
-                channels=1,
-            )
-            self.get_states_root().test_input_segments_wav = soundfile.SoundFile(
-                debug_folder / f"{self.debug_ts}_test_input_segments.wav",
-                mode="w+",
-                format="WAV",
-                samplerate=MODEL_SAMPLE_RATE,
-                channels=1,
-            )
-
-    def get_states_root(self) -> AgentStates:
-        if isinstance(self.agent, TreeAgentPipeline):
-            # self.states is a dict
-            return self.states[self.agent.source_module]
-        else:
-            # self.states is a list
-            return self.states[0]
-
-    def reset_states(self):
-        if isinstance(self.agent, TreeAgentPipeline):
-            states_iter = self.states.values()
-        else:
-            states_iter = self.states
-        for state in states_iter:
-            state.reset()
-
-    def debug_log(self, *args):
-        if self.debug:
-            logger.info(*args)
-
-    def process_incoming_bytes(self, incoming_bytes, target_language, sample_rate):
-        # TODO: currently just taking sample rate here, refactor sample rate
-        # bytes is 16bit signed int
-        self.incoming_sample_rate = sample_rate
-        segment, sr = self._preprocess_wav(incoming_bytes)
-
-        segment = SpeechSegment(
-            content=segment, sample_rate=sr, tgt_lang=target_language
-        )
-        # # segment is array([0, 0, 0, ..., 0, 0, 0], dtype=int16)
-        self.input_queue.put_nowait(segment)
-        print("process_incoming: put input_queue")
-
-    def get_input_segment(self):
-        if self.input_queue.empty():
-            return None
-        chunk = self.input_queue.get_nowait()
-        self.input_queue.task_done()
-        return chunk
-
-    def _preprocess_wav(self, data: Any) -> Tuple[np.ndarray, int]:
-        segment, sample_rate = soundfile.read(
-            io.BytesIO(data),
-            dtype="float32",
-            always_2d=True,
-            frames=-1,
-            start=0,
-            format="RAW",
-            subtype="PCM_16",
-            samplerate=self.incoming_sample_rate,
-            channels=1,
-        )
-        if self.debug:
-            self.test_incoming_wav.seek(0, soundfile.SEEK_END)
-            self.test_incoming_wav.write(segment)
-
-        segment = segment.T
-        segment, new_sample_rate = convert_waveform(
-            segment,
-            sample_rate,
-            normalize_volume=False,
-            to_mono=True,
-            to_sample_rate=MODEL_SAMPLE_RATE,
-        )
-
-        assert MODEL_SAMPLE_RATE == new_sample_rate
-        segment = segment.squeeze(axis=0)
-        return segment, new_sample_rate
-
-    def process_pipeline_impl(self, input_segment):
-        try:
-            with torch.no_grad():
-                output_segment = OutputSegments(
-                    self.agent.pushpop(input_segment, self.states)
-                )
-            if (
-                self.get_states_root().first_input_ts is not None
-                and self.first_input_ts is None
-            ):
-                # TODO: this is hacky
-                self.first_input_ts = self.get_states_root().first_input_ts
-
-            if not output_segment.is_empty:
-                print("PUT IN OUTPUT QUEUE")
-                self.output_queue.put_nowait(output_segment)
-
-            if output_segment.finished:
-                print("OUTPUT SEGMENT IS FINISHED. Resetting states.")
-
-                self.reset_states()
-
-                if self.debug:
-                    # when we rebuild states, this value is reset to whatever
-                    # is in the system dir config, which defaults debug=False.
-                    self.get_states_root().debug = True
-        except Exception as e:
-            logger.error(f"Got exception while processing pipeline: {e}")
-            traceback.print_exc()
-        return input_segment
-
-    def process_pipeline_loop(self):
-        if self.close:
-            print("transcoder closed")
-            return  # closes the thread
-
-        print("processing_pipeline")
-        while not self.close:
-            input_segment = self.get_input_segment()
-            if input_segment is None:
-                if self.get_states_root().is_fresh_state:  # TODO: this is hacky
-                    time.sleep(0.3)
-                    print("loop: input_queue empty")
-                else:
-                    time.sleep(0.03)
-                continue
-            print("loop: got input_segment")
-            self.process_pipeline_impl(input_segment)
-        print("finished processing_pipeline")
-
-    def process_pipeline_once(self):
-        if self.close:
-            return
-
-        self.debug_log("processing pipeline once")
-        input_segment = self.get_input_segment()
-        if input_segment is None:
-            return
-        self.process_pipeline_impl(input_segment)
-        self.debug_log("finished processing_pipeline_once")
-
-    def get_output_segment(self):
-        if self.output_queue.empty():
-            return None
-
-        output_chunk = self.output_queue.get_nowait()
-        self.output_queue.task_done()
-        return output_chunk
-
-    def start(self):
-        print("starting transcoder in a thread")
-        threading.Thread(target=self.process_pipeline_loop).start()
-
-    def first_translation_time(self):
-        return round((self.first_output_ts - self.first_input_ts) / 1000, 2)
-
-    def get_buffered_output(self) -> SpeechAndTextOutput:
-        now = time.time() * 1000
-        print(f"get_buffered_output queue size: {self.output_queue.qsize()}")
-        while not self.output_queue.empty():
-            tmp_out = self.get_output_segment()
-            if tmp_out and tmp_out.compute_length(self.g2p) > 0:
-                if len(self.output_buffer) == 0:
-                    self.last_output_ts = now
-                self._populate_output_buffer(tmp_out)
-                self._increment_output_buffer_size(tmp_out)
-
-                if tmp_out.finished:
-                    self.debug_log("tmp_out.finished")
-                    res = self._gather_output_buffer_data(final=True)
-                    self.debug_log(f"gathered output data: {res}")
-                    self.output_buffer = []
-                    self.increment_output_buffer_size = 0
-                    self.last_output_ts = now
-                    self.first_output_ts = now
-                    return res
-            else:
-                self.debug_log("tmp_out.compute_length is not > 0")
-
-        if len(self.output_buffer) > 0 and (
-            now - self.last_output_ts >= self.output_buffer_idle_ms
-            or self.output_buffer_cur_size >= self.output_buffer_size_limit
-        ):
-            self.debug_log(
-                "[get_buffered_output] output_buffer is not empty. getting res to return."
-            )
-            self.last_output_ts = now
-            res = self._gather_output_buffer_data(final=False)
-            self.debug_log(f"gathered output data: {res}")
-            self.output_buffer = []
-            self.output_buffer_phoneme_count = 0
-            self.first_output_ts = now
-            return res
-        else:
-            self.debug_log("[get_buffered_output] output_buffer is empty...")
-            return None
-
-    def _gather_output_buffer_data(self, final):
-        output = SpeechAndTextOutput()
-        output.final = final
-        output = OutputSegments.join_output_buffer(self.output_buffer, output)
-        return output
-
-    def _increment_output_buffer_size(self, segment: OutputSegments):
-        self.output_buffer_cur_size += segment.compute_length(self.g2p)
-
-    def _populate_output_buffer(self, segment: OutputSegments):
-        self.output_buffer.append(segment.segments)
-
-    def _compute_phoneme_count(self, string: str) -> int:
-        return len([x for x in self.g2p(string) if x != " "])

style.css DELETED
@@ -1,16 +0,0 @@
-h1 {
-  text-align: center;
-}
-
-#duplicate-button {
-  margin: auto;
-  color: #fff;
-  background: #1565c0;
-  border-radius: 100vh;
-}
-
-#component-0 {
-  max-width: 730px;
-  margin: auto;
-  padding-top: 1.5rem;
-}