ReneeYe committed on
Commit
c826555
•
1 Parent(s): 960a1ed

add post-processing, and note to use Chrome.

Browse files
Files changed (1) hide show
  1. app.py +34 -4
app.py CHANGED
@@ -11,6 +11,7 @@ import os
11
  import traceback
12
  import shutil
13
  import yaml
 
14
  from pydub import AudioSegment
15
  import gradio as gr
16
  from huggingface_hub import snapshot_download
@@ -56,9 +57,12 @@ def convert_audio_to_16k_wav(audio_input):
56
  num_channels = sound.channels
57
  num_frames = int(sound.frame_count())
58
  filename = audio_input.split("/")[-1]
 
59
  if (num_channels > 1) or (sample_rate != 16000): # convert to mono-channel 16k wav
60
- sound = sound.set_channels(1)
61
- sound = sound.set_frame_rate(16000)
 
 
62
  num_frames = int(sound.frame_count())
63
  filename = filename.replace(".wav", "") + "_16k.wav"
64
  sound.export(f"data/{filename}", format="wav")
@@ -109,6 +113,31 @@ def generate(model_path):
109
  return output.read().strip()
110
 
111
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  def remove_temp_files(audio_file):
113
  os.remove("temp.txt")
114
  os.remove("data/test_case.tsv")
@@ -145,8 +174,9 @@ iface = gr.Interface(
145
  examples=[['short-case.wav', "German"], ['long-case.wav', "German"]],
146
  title="ConST: an end-to-end speech translator",
147
  description='ConST is an end-to-end speech-to-text translation model, whose algorithm corresponds to the '
148
- 'NAACL 2022 paper *"Cross-modal Contrastive Learning for Speech Translation"* (see the paper at https://arxiv.org/abs/2205.02444 for more details).'
149
- 'This is a live demo for ConST, to translate English into eight European languages.',
 
150
  article="- The motivation of the ConST model is to use the contrastive learning method to learn similar representations for semantically similar speech and text, " \
151
  "thus leveraging MT to help improve ST performance. \n"
152
  "- The models you are experiencing are trained based on the MuST-C dataset (https://ict.fbk.eu/must-c/), " \
 
11
  import traceback
12
  import shutil
13
  import yaml
14
+ import re
15
  from pydub import AudioSegment
16
  import gradio as gr
17
  from huggingface_hub import snapshot_download
 
57
  num_channels = sound.channels
58
  num_frames = int(sound.frame_count())
59
  filename = audio_input.split("/")[-1]
60
+ print("original file is at:", audio_input)
61
  if (num_channels > 1) or (sample_rate != 16000): # convert to mono-channel 16k wav
62
+ if num_channels > 1:
63
+ sound = sound.set_channels(1)
64
+ if sample_rate != 16000:
65
+ sound = sound.set_frame_rate(16000)
66
  num_frames = int(sound.frame_count())
67
  filename = filename.replace(".wav", "") + "_16k.wav"
68
  sound.export(f"data/{filename}", format="wav")
 
113
  return output.read().strip()
114
 
115
 
116
def post_processing(raw_sentence):
    """Clean a raw model output sentence of short prefixes and "(...)" tags.

    Two independent clean-ups are applied, depending on the sentence shape:

    * Sentences containing exactly one ":" are treated as "<prefix>: <text>".
      The prefix is dropped (only "<text>" is returned) when the prefix is
      at most 3 characters long, when it is at most 3 characters long after
      removing its first "(...)" tag, or -- for prefixes without any "(...)"
      tag -- when "<text>" is longer than 8 characters.
    * Sentences without ":" that contain "(...)" tags have each tag removed,
      provided that removing the tag leaves more than 5 characters of the
      original sentence. If the cleaned result ends up with 5 characters or
      fewer, the original sentence is returned instead.

    The "(...)" tags are presumably sound annotations such as "(Music)"
    (the tag variable was called ``bgm``) -- TODO confirm against the
    model's training data.

    Args:
        raw_sentence: The sentence to clean, as a ``str``.

    Returns:
        The cleaned sentence, or ``raw_sentence`` unchanged when none of the
        rules apply.
    """
    output_sentence = raw_sentence
    if ":" in raw_sentence:
        parts = raw_sentence.split(":")
        # Only the unambiguous single-colon shape is rewritten; sentences
        # with several colons are returned untouched.
        if len(parts) == 2:
            prefix = parts[0].strip()
            text = parts[1].strip()
            # re.search returns None instead of raising when the prefix holds
            # "(" and ")" in an order that never forms a "(...)" pair, which
            # previously crashed on ``re.findall(...)[0]``.
            tag = re.search(r"\(.*?\)", prefix)
            if len(prefix) <= 3:
                output_sentence = text
            elif tag is not None:
                # Judge the prefix length without its (first) tag text.
                if len(prefix.replace(tag.group(0), "").strip()) <= 3:
                    output_sentence = text
            elif len(text) > 8:
                output_sentence = text
    elif ("(" in raw_sentence) and (")" in raw_sentence):
        for tag in re.findall(r"\(.*?\)", raw_sentence):
            # Drop a tag only if the original sentence still has real
            # content (more than 5 characters) without it.
            if len(raw_sentence.replace(tag, "").strip()) > 5:
                output_sentence = output_sentence.replace(tag, "").strip()
        # NOTE(review): this fallback is applied once, after all tags have
        # been processed -- confirm the intended nesting against app.py.
        if len(output_sentence) <= 5:
            output_sentence = raw_sentence
    return output_sentence
139
+
140
+
141
  def remove_temp_files(audio_file):
142
  os.remove("temp.txt")
143
  os.remove("data/test_case.tsv")
 
174
  examples=[['short-case.wav', "German"], ['long-case.wav', "German"]],
175
  title="ConST: an end-to-end speech translator",
176
  description='ConST is an end-to-end speech-to-text translation model, whose algorithm corresponds to the '
177
+ 'NAACL 2022 paper *"Cross-modal Contrastive Learning for Speech Translation"* (see the paper at https://arxiv.org/abs/2205.02444 for more details). '
178
+ 'This is a live demo for ConST, to translate English into eight European languages. \n'
179
+ 'p.s. For better experience, we recommend using **Chrome** to record audio.',
180
  article="- The motivation of the ConST model is to use the contrastive learning method to learn similar representations for semantically similar speech and text, " \
181
  "thus leveraging MT to help improve ST performance. \n"
182
  "- The models you are experiencing are trained based on the MuST-C dataset (https://ict.fbk.eu/must-c/), " \