# -*- coding: utf-8 -*-

"""
@Author     : Rong Ye
@Time       : May 2022
@Contact    : yerong@bytedance
@Description: Gradio demo for ConST, an end-to-end speech translation model (English to eight European languages).
"""

import os
import shutil
import yaml
import torchaudio
import gradio as gr
from huggingface_hub import snapshot_download


LANGUAGE_CODES = {
    "German": "de",
    "Spanish": "es",
    "French": "fr",
    "Italian": "it",
    "Netherlands": "nl",
    "Portuguese": "pt",
    "Romanian": "ro",
    "Russian": "ru",
}

LANG_GEN_SETUPS = {
    "de": {"beam": 10, "lenpen": 0.7},
    "es": {"beam": 10, "lenpen": 0.7},
    "fr": {"beam": 10, "lenpen": 0.7},
    "it": {"beam": 10, "lenpen": 0.7},
    "nl": {"beam": 10, "lenpen": 0.7},
    "pt": {"beam": 10, "lenpen": 0.7},
    "ro": {"beam": 10, "lenpen": 0.7},
    "ru": {"beam": 10, "lenpen": 0.1},
}

os.system("git clone https://github.com/ReneeYe/ConST")
os.system('mv ConST/* ./')
os.system("pip3 install -r requirements.txt")
os.system("python3 setup.py install")
os.system("python3 ConST/setup.py build_ext --inplace")
os.system("mkdir -p data checkpoint")


# Download all checkpoints and vocabularies once; returns the local cache path.
huggingface_model_dir = snapshot_download(repo_id="ReneeYe/ConST_en2x_models")
print(huggingface_model_dir)

def convert_audio_to_16k_wav(audio_input):
    # Despite the name, this only copies the recording into data/ and reads its
    # frame count; it assumes the input is already a 16 kHz WAV file.
    num_frames = torchaudio.info(audio_input.name).num_frames
    filename = audio_input.name.split("/")[-1]
    shutil.copy(audio_input.name, f"data/{filename}")
    return f"data/{filename}", num_frames
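
# ConST checkpoints expect 16 kHz mono audio. Below is a minimal resampling
# helper sketched with torchaudio for the case where the recording is not
# already 16 kHz; it is an illustrative addition and is not called by the demo.
def resample_to_16k(in_path, out_path):
    waveform, sample_rate = torchaudio.load(in_path)
    if sample_rate != 16000:
        waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
    if waveform.size(0) > 1:  # downmix stereo to mono
        waveform = waveform.mean(dim=0, keepdim=True)
    torchaudio.save(out_path, waveform, 16000)
    return out_path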


def prepare_tsv(file_name, n_frame, language, task="ST"):
    tgt_lang = LANGUAGE_CODES[language]
    with open("data/test_case.tsv", "w") as f:
        f.write("id\taudio\tn_frames\ttgt_text\tspeaker\tsrc_lang\ttgt_lang\tsrc_text\n")
        f.write(f"sample\t{file_name}\t{n_frame}\tThis is in {tgt_lang}.\tspk.1\ten\t{tgt_lang}\tThis is English.\n")


def get_vocab_and_yaml(language):
    tgt_lang = LANGUAGE_CODES[language]
    # Copy the SentencePiece model and vocabulary for this language pair
    # (e.g. spm_ende.model and spm_ende.txt) into data/.
    shutil.copy(os.path.join(huggingface_model_dir, f"vocabulary/spm_en{tgt_lang}.model"), "./data")
    shutil.copy(os.path.join(huggingface_model_dir, f"vocabulary/spm_en{tgt_lang}.txt"), "./data")

    # Write the fairseq data config.
    abs_path = os.getcwd()
    yaml_dict = dict(LANG_GEN_SETUPS[tgt_lang])  # copy the per-language beam/lenpen so the defaults are not mutated
    yaml_dict["input_channels"] = 1
    yaml_dict["use_audio_input"] = True
    yaml_dict["prepend_tgt_lang_tag"] = True
    yaml_dict["prepend_src_lang_tag"] = True
    yaml_dict["audio_root"] = os.path.join(abs_path, "data")
    yaml_dict["vocab_filename"] = f"spm_en{tgt_lang}.txt"
    yaml_dict["bpe_tokenizer"] = {"bpe": "sentencepiece",
                                  "sentencepiece_model": os.path.join(abs_path, f"data/spm_en{tgt_lang}.model")}
    with open("data/config.yaml", "w") as f:
        yaml.dump(yaml_dict, f)
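
# The resulting data/config.yaml looks roughly as follows (German example;
# the absolute paths are illustrative, and yaml.dump sorts keys alphabetically):
#   audio_root: /home/user/app/data
#   beam: 10
#   bpe_tokenizer:
#     bpe: sentencepiece
#     sentencepiece_model: /home/user/app/data/spm_ende.model
#   input_channels: 1
#   lenpen: 0.7
#   prepend_src_lang_tag: true
#   prepend_tgt_lang_tag: true
#   use_audio_input: true
#   vocab_filename: spm_ende.txt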


def get_model(language):
    # download models to checkpoint/xxx
    return os.path.join(huggingface_model_dir, f"models/const_en{LANGUAGE_CODES[language]}.pt")


def generate(model_path):
    os.system(f"fairseq-generate data/ --gen-subset test_case --task speech_to_text --prefix-size 1 \
                --max-tokens 4000000 --max-source-positions 4000000 \
                --config-yaml config.yaml  --path {model_path} | tee temp.txt")
    output = os.popen("grep ^D temp.txt | sort -n -k 2 -t '-' | cut -f 3")
    return output.read().strip()
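
# fairseq-generate prints detokenized hypotheses as lines of the form
#   D-0<TAB>-0.27<TAB>the translated sentence
# (the score shown is illustrative). The shell pipeline above keeps those lines
# (grep ^D), orders them by the numeric sample id after the dash
# (sort -n -k 2 -t '-'), and extracts the text column (cut -f 3).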


def remove_temp_files():
    os.remove("temp.txt")
    os.remove("data/test_case.tsv")


def run(audio_file, language):
    converted_audio_file, n_frame = convert_audio_to_16k_wav(audio_file)
    prepare_tsv(converted_audio_file, n_frame, language)
    get_vocab_and_yaml(language)
    model_path = get_model(language)
    generated_output = generate(model_path)
    remove_temp_files()
    return generated_output
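
# For a quick sanity check outside Gradio, `run` can be called with any object
# exposing a `.name` attribute, mirroring what gr.inputs.Audio(type="file")
# passes in (hypothetical usage, not executed here):
#
#   from types import SimpleNamespace
#   print(run(SimpleNamespace(name="case1.wav"), "German"))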


def greet(audio_file, language):
    # Debug stub left over from development; not wired into the interface below.
    print(audio_file.name)
    return f"Hello {language}!!"


inputs = [
    gr.inputs.Audio(source="microphone", type="file", label="Record something (in English)..."),
    gr.inputs.Dropdown(list(LANGUAGE_CODES.keys()), default="German", label="Translate English into..."),
]

iface = gr.Interface(
    fn=run,
    inputs=inputs,
    outputs=[gr.outputs.Textbox(label="The translation")],
    examples=[["case1.wav", "German"], ["case2.wav", "German"], ["case3.wav", "German"]],
    title="ConST: an end-to-end speech translator",
    description="End-to-end Speech Translation Live Demo for English to eight European languages.",
    article="ConST is an end-to-end speech translation model (see paper <a href='https://arxiv.org/abs/2205.02444', target='_blank'>here</a>). "
            "Its motivation is to use contrastive learning method to learn similar representations for semantically similar speech and text.",
    theme="seafoam",
    layout='vertical',
    # analytics_enabled=False,
    # flagging_dir='results/flagged/',
    # allow_flagging=True,
    enable_queue=True
)
iface.launch(inline=False)