Spaces:
Build error
Build error
add post-processing, and note to use Chrome.
Browse files
app.py
CHANGED
@@ -11,6 +11,7 @@ import os
|
|
11 |
import traceback
|
12 |
import shutil
|
13 |
import yaml
|
|
|
14 |
from pydub import AudioSegment
|
15 |
import gradio as gr
|
16 |
from huggingface_hub import snapshot_download
|
@@ -56,9 +57,12 @@ def convert_audio_to_16k_wav(audio_input):
|
|
56 |
num_channels = sound.channels
|
57 |
num_frames = int(sound.frame_count())
|
58 |
filename = audio_input.split("/")[-1]
|
|
|
59 |
if (num_channels > 1) or (sample_rate != 16000): # convert to mono-channel 16k wav
|
60 |
-
|
61 |
-
|
|
|
|
|
62 |
num_frames = int(sound.frame_count())
|
63 |
filename = filename.replace(".wav", "") + "_16k.wav"
|
64 |
sound.export(f"data/{filename}", format="wav")
|
@@ -109,6 +113,31 @@ def generate(model_path):
|
|
109 |
return output.read().strip()
|
110 |
|
111 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
112 |
def remove_temp_files(audio_file):
|
113 |
os.remove("temp.txt")
|
114 |
os.remove("data/test_case.tsv")
|
@@ -145,8 +174,9 @@ iface = gr.Interface(
|
|
145 |
examples=[['short-case.wav', "German"], ['long-case.wav', "German"]],
|
146 |
title="ConST: an end-to-end speech translator",
|
147 |
description='ConST is an end-to-end speech-to-text translation model, whose algorithm corresponds to the '
|
148 |
-
'NAACL 2022 paper *"Cross-modal Contrastive Learning for Speech Translation"* (see the paper at https://arxiv.org/abs/2205.02444 for more details).'
|
149 |
-
'This is a live demo for ConST, to translate English into eight European languages.'
|
|
|
150 |
article="- The motivation of the ConST model is to use the contrastive learning method to learn similar representations for semantically similar speech and text, " \
|
151 |
"thus leveraging MT to help improve ST performance. \n"
|
152 |
"- The models you are experiencing are trained based on the MuST-C dataset (https://ict.fbk.eu/must-c/), " \
|
|
|
11 |
import traceback
|
12 |
import shutil
|
13 |
import yaml
|
14 |
+
import re
|
15 |
from pydub import AudioSegment
|
16 |
import gradio as gr
|
17 |
from huggingface_hub import snapshot_download
|
|
|
57 |
num_channels = sound.channels
|
58 |
num_frames = int(sound.frame_count())
|
59 |
filename = audio_input.split("/")[-1]
|
60 |
+
print("original file is at:", audio_input)
|
61 |
if (num_channels > 1) or (sample_rate != 16000): # convert to mono-channel 16k wav
|
62 |
+
if num_channels > 1:
|
63 |
+
sound = sound.set_channels(1)
|
64 |
+
if sample_rate != 16000:
|
65 |
+
sound = sound.set_frame_rate(16000)
|
66 |
num_frames = int(sound.frame_count())
|
67 |
filename = filename.replace(".wav", "") + "_16k.wav"
|
68 |
sound.export(f"data/{filename}", format="wav")
|
|
|
113 |
return output.read().strip()
|
114 |
|
115 |
|
116 |
+
def post_processing(raw_sentence):
    """Strip a short leading label or parenthesised annotations from a sentence.

    Two mutually exclusive clean-ups are applied:

    * If the sentence contains exactly one ``:``, the text before it is
      treated as a prefix and dropped when any of these hold:
        - the prefix is at most 3 characters long;
        - the prefix contains a ``(...)`` group and at most 3 characters
          remain once that group is removed from the prefix;
        - the prefix has no ``(...)`` group and the text after the colon is
          longer than 8 characters.
    * Otherwise, if the sentence contains both ``(`` and ``)``, every
      ``(...)`` group is removed, provided enough text (more than 5
      characters) is left over; if the cleaned result ends up with 5
      characters or fewer, the original sentence is returned instead.

    Args:
        raw_sentence: The sentence to clean up (a ``str``).

    Returns:
        The cleaned sentence, or ``raw_sentence`` unchanged when no rule
        applies.
    """
    output_sentence = raw_sentence
    if ":" in raw_sentence:
        parts = raw_sentence.split(":")
        # Only handle the unambiguous "prefix: text" shape; sentences with
        # several colons are left untouched.
        if len(parts) == 2:
            prefix = parts[0].strip()
            remainder = parts[1].strip()
            # Use search() rather than findall()[0]: a prefix such as ") x ("
            # contains both bracket characters but no "(...)" group, and
            # indexing an empty findall() result raised IndexError.
            annotation = re.search(r"\(.*?\)", prefix)
            if len(prefix) <= 3:
                output_sentence = remainder
            elif annotation is not None:
                # Presumably a bracketed annotation (the original code calls
                # it "bgm") in front of a short label; drop both only when
                # the label itself is short.
                label = prefix.replace(annotation.group(0), "").strip()
                if len(label) <= 3:
                    output_sentence = remainder
            elif len(remainder) > 8:
                output_sentence = remainder
    elif "(" in raw_sentence and ")" in raw_sentence:
        for bgm in re.findall(r"\(.*?\)", raw_sentence):
            # The length test is made against the raw sentence, while the
            # removal accumulates on the output.
            if len(raw_sentence.replace(bgm, "").strip()) > 5:
                output_sentence = output_sentence.replace(bgm, "").strip()
        # Do not return a near-empty result; fall back to the input.
        if len(output_sentence) <= 5:
            output_sentence = raw_sentence
    return output_sentence
|
139 |
+
|
140 |
+
|
141 |
def remove_temp_files(audio_file):
|
142 |
os.remove("temp.txt")
|
143 |
os.remove("data/test_case.tsv")
|
|
|
174 |
examples=[['short-case.wav', "German"], ['long-case.wav', "German"]],
|
175 |
title="ConST: an end-to-end speech translator",
|
176 |
description='ConST is an end-to-end speech-to-text translation model, whose algorithm corresponds to the '
|
177 |
+
'NAACL 2022 paper *"Cross-modal Contrastive Learning for Speech Translation"* (see the paper at https://arxiv.org/abs/2205.02444 for more details). '
|
178 |
+
'This is a live demo for ConST, to translate English into eight European languages. \n'
|
179 |
+
'p.s. For better experience, we recommend using **Chrome** to record audio.',
|
180 |
article="- The motivation of the ConST model is to use the contrastive learning method to learn similar representations for semantically similar speech and text, " \
|
181 |
"thus leveraging MT to help improve ST performance. \n"
|
182 |
"- The models you are experiencing are trained based on the MuST-C dataset (https://ict.fbk.eu/must-c/), " \
|