# -*- coding: utf-8 -*-
"""
@Author  : Rong Ye
@Time    : May 2022
@Contact : yerong@bytedance
@Description: Gradio demo of ConST, an end-to-end speech translation model
              (English to eight European languages).
"""

import os
import shutil

import yaml
import torchaudio
import gradio as gr
from huggingface_hub import snapshot_download

LANGUAGE_CODES = {
    "German": "de",
    "Spanish": "es",
    "French": "fr",
    "Italian": "it",
    "Dutch": "nl",
    "Portuguese": "pt",
    "Romanian": "ro",
    "Russian": "ru",
}

# Per-language beam-search settings, merged into data/config.yaml at request time.
LANG_GEN_SETUPS = {
    "de": {"beam": 10, "lenpen": 0.7},
    "es": {"beam": 10, "lenpen": 0.7},
    "fr": {"beam": 10, "lenpen": 0.7},
    "it": {"beam": 10, "lenpen": 0.7},
    "nl": {"beam": 10, "lenpen": 0.7},
    "pt": {"beam": 10, "lenpen": 0.7},
    "ro": {"beam": 10, "lenpen": 0.7},
    "ru": {"beam": 10, "lenpen": 0.1},
}

# Fetch and install the ConST codebase (a fairseq fork), then prepare the working directories.
os.system("git clone https://github.com/ReneeYe/ConST")
os.system("mv ConST/* ./")
os.system("pip3 install -r requirements.txt")
os.system("python3 setup.py install")
os.system("python3 ConST/setup.py build_ext --inplace")
os.system("mkdir -p data checkpoint")

# Download the pretrained en->X checkpoints and vocabularies from the Hugging Face Hub.
huggingface_model_dir = snapshot_download(repo_id="ReneeYe/ConST_en2x_models")
print(huggingface_model_dir)


def convert_audio_to_16k_wav(audio_input):
    # The microphone recording is assumed to already be a 16 kHz WAV, so no
    # resampling happens here: read the frame count and stage the file under data/.
    num_frames = torchaudio.info(audio_input.name).num_frames
    filename = audio_input.name.split("/")[-1]
    shutil.copy(audio_input.name, f"data/{filename}")
    return f"data/{filename}", num_frames


def prepare_tsv(file_name, n_frame, language, task="ST"):
    # Write a one-sample fairseq speech-to-text manifest; the text columns are
    # placeholders, since we only run inference.
    tgt_lang = LANGUAGE_CODES[language]
    with open("data/test_case.tsv", "w") as f:
        f.write("id\taudio\tn_frames\ttgt_text\tspeaker\tsrc_lang\ttgt_lang\tsrc_text\n")
        f.write(f"sample\t{file_name}\t{n_frame}\tThis is in {tgt_lang}.\tspk.1\ten\t{tgt_lang}\tThis is English.\n")


def get_vocab_and_yaml(language):
    tgt_lang = LANGUAGE_CODES[language]
    # Stage the SentencePiece model and vocabulary (e.g. spm_ende.model / spm_ende.txt)
    # from the downloaded snapshot under data/.
    shutil.copy(os.path.join(huggingface_model_dir, f"vocabulary/spm_en{tgt_lang}.model"), "./data")
    shutil.copy(os.path.join(huggingface_model_dir, f"vocabulary/spm_en{tgt_lang}.txt"), "./data")

    # Write the fairseq data config for this language pair.
    abs_path = os.getcwd()
    yaml_dict = dict(LANG_GEN_SETUPS[tgt_lang])  # copy, so the module-level defaults stay untouched
    yaml_dict["input_channels"] = 1
    yaml_dict["use_audio_input"] = True
    yaml_dict["prepend_tgt_lang_tag"] = True
    yaml_dict["prepend_src_lang_tag"] = True
    yaml_dict["audio_root"] = os.path.join(abs_path, "data")
    yaml_dict["vocab_filename"] = f"spm_en{tgt_lang}.txt"
    yaml_dict["bpe_tokenizer"] = {
        "bpe": "sentencepiece",
        "sentencepiece_model": os.path.join(abs_path, f"data/spm_en{tgt_lang}.model"),
    }
    with open("data/config.yaml", "w") as f:
        yaml.dump(yaml_dict, f)


def get_model(language):
    # Resolve the checkpoint path for the selected target language inside the downloaded snapshot.
    return os.path.join(huggingface_model_dir, f"models/const_en{LANGUAGE_CODES[language]}.pt")


def generate(model_path):
    # --prefix-size 1 forces the decoder to start from the prepended target-language tag.
    os.system(
        f"fairseq-generate data/ --gen-subset test_case --task speech_to_text --prefix-size 1 "
        f"--max-tokens 4000000 --max-source-positions 4000000 "
        f"--config-yaml config.yaml --path {model_path} | tee temp.txt"
    )
    # Detokenized hypotheses appear on lines starting with "D-<id>";
    # sort by sample id and keep only the text column.
    output = os.popen("grep ^D temp.txt | sort -n -k 2 -t '-' | cut -f 3")
    return output.read().strip()


def remove_temp_files():
    os.remove("temp.txt")
    os.remove("data/test_case.tsv")


def run(audio_file, language):
    converted_audio_file, n_frame = convert_audio_to_16k_wav(audio_file)
    prepare_tsv(converted_audio_file, n_frame, language)
    get_vocab_and_yaml(language)
    model_path = get_model(language)
    generated_output = generate(model_path)
    remove_temp_files()
    return generated_output


def greet(audio_file, language):
    # Minimal stub kept around for debugging the Gradio wiring.
    print(audio_file.name)
    return f"Hello {language}!!"
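# --- Pipeline overview (descriptive only; mirrors run() above) ----------------
#   1. convert_audio_to_16k_wav: stage the recorded WAV under data/ and count frames
#   2. prepare_tsv:              write the one-sample manifest data/test_case.tsv
#   3. get_vocab_and_yaml:       stage the SentencePiece vocab, write data/config.yaml
#   4. get_model:                locate the en->X checkpoint in the HF snapshot
#   5. generate:                 shell out to fairseq-generate and parse the D-lines
# -------------------------------------------------------------------------------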
inputs = [
    gr.inputs.Audio(source="microphone", type="file", label="Record something (in English)..."),
    gr.inputs.Dropdown(list(LANGUAGE_CODES.keys()), default="German", label="From English to language X..."),
]

iface = gr.Interface(
    fn=run,
    inputs=inputs,
    outputs=[gr.outputs.Textbox(label="The translation")],
    examples=[["case1.wav", "German"], ["case2.wav", "German"], ["case3.wav", "German"]],
    title="ConST: an end-to-end speech translator",
    description="Live demo of end-to-end speech translation from English into eight European languages.",
    article="ConST is an end-to-end speech translation model (see the paper here). "
            "Its motivation is to use a contrastive-learning method to learn similar representations "
            "for semantically similar speech and text.",
    theme="seafoam",
    layout="vertical",
    # analytics_enabled=False,
    # flagging_dir='results/flagged/',
    # allow_flagging=True,
    # flagging_options=['Interesting!', 'Error: Claim Phrase Parsing', 'Error: Local Premise',
    #                   'Error: Require Commonsense', 'Error: Evidence Retrieval'],
    enable_queue=True,
)
iface.launch(inline=False)
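# A minimal local smoke test without launching the web UI (hypothetical snippet;
# assumes a 16 kHz mono English recording named sample.wav in the working directory,
# and relies on run() only needing an object with a .name path attribute):
#
#     with open("sample.wav", "rb") as audio_file:
#         print(run(audio_file, "German"))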