Spaces:

ReneeYe
/

ConST-speech2text-translator

Build error

App Files Files Community

ReneeYe committed on May 30, 2022

Commit

c826555

•

1 Parent(s): 960a1ed

add post-processing, and note to use Chrome.

Browse files

Files changed (1) hide show

app.py +34 -4

app.py CHANGED Viewed

@@ -11,6 +11,7 @@ import os
 import traceback
 import shutil
 import yaml
 from pydub import AudioSegment
 import gradio as gr
 from huggingface_hub import snapshot_download
@@ -56,9 +57,12 @@ def convert_audio_to_16k_wav(audio_input):
     num_channels = sound.channels
     num_frames = int(sound.frame_count())
     filename = audio_input.split("/")[-1]
     if (num_channels > 1) or (sample_rate != 16000): # convert to mono-channel 16k wav
-        sound = sound.set_channels(1)
-        sound = sound.set_frame_rate(16000)
         num_frames = int(sound.frame_count())
         filename = filename.replace(".wav", "") + "_16k.wav"
         sound.export(f"data/{filename}", format="wav")
@@ -109,6 +113,31 @@ def generate(model_path):
     return output.read().strip()
 def remove_temp_files(audio_file):
     os.remove("temp.txt")
     os.remove("data/test_case.tsv")
@@ -145,8 +174,9 @@ iface = gr.Interface(
     examples=[['short-case.wav', "German"], ['long-case.wav', "German"]],
     title="ConST: an end-to-end speech translator",
     description='ConST is an end-to-end speech-to-text translation model, whose algorithm corresponds to the '
-                'NAACL 2022 paper *"Cross-modal Contrastive Learning for Speech Translation"* (see the paper at https://arxiv.org/abs/2205.02444 for more details).'
-                'This is a live demo for ConST, to translate English into eight European languages.',
     article="- The motivation of the ConST model is to use the contrastive learning method to learn similar representations for semantically similar speech and text, " \
             "thus leveraging MT to help improve ST performance. \n"
             "- The models you are experiencing are trained based on the MuST-C dataset (https://ict.fbk.eu/must-c/), " \

 import traceback
 import shutil
 import yaml
+import re
 from pydub import AudioSegment
 import gradio as gr
 from huggingface_hub import snapshot_download
     num_channels = sound.channels
     num_frames = int(sound.frame_count())
     filename = audio_input.split("/")[-1]
+    print("original file is at:", audio_input)
     if (num_channels > 1) or (sample_rate != 16000): # convert to mono-channel 16k wav
+        if num_channels > 1:
+            sound = sound.set_channels(1)
+        if sample_rate != 16000:
+            sound = sound.set_frame_rate(16000)
         num_frames = int(sound.frame_count())
         filename = filename.replace(".wav", "") + "_16k.wav"
         sound.export(f"data/{filename}", format="wav")
     return output.read().strip()
def post_processing(raw_sentence):
    """Clean up a raw output sentence by dropping short prefixes and "(...)" groups.

    Two kinds of clutter are handled:

    * ``"<prefix>: <text>"`` with exactly one colon. The prefix (and the
      colon) is dropped when the prefix is at most 3 characters long, or
      when it contains a parenthesised group and either the prefix without
      that group is at most 3 characters long or the text after the colon
      is longer than 8 characters.
      (Presumably the prefix is a speaker label and the parenthesised group
      a background-sound cue such as "(Music)" -- confirm against the
      model's real output.)
    * Sentences without a colon: every non-greedy ``(...)`` group is removed,
      as long as the sentence minus that group is longer than 5 characters.
      If the cleaned result ends up 5 characters or shorter, the raw
      sentence is returned instead.

    Anything else (no colon and no group, or more than one colon) is
    returned unchanged.

    Args:
        raw_sentence: The sentence to clean, as a ``str``.

    Returns:
        The cleaned sentence, or ``raw_sentence`` itself when no rule applies.
    """
    group_pattern = r"\(.*?\)"

    if ":" in raw_sentence:
        pieces = raw_sentence.split(":")
        if len(pieces) != 2:
            # More than one colon: too ambiguous to tell prefix from text.
            return raw_sentence

        prefix, text = (piece.strip() for piece in pieces)
        if len(prefix) <= 3:
            return text

        # The prefix may contain "(" and ")" without forming a "(...)" group
        # (e.g. ")abc("); re.search returns None then, instead of the
        # IndexError an unconditional findall(...)[0] would raise.
        group = re.search(group_pattern, prefix)
        if group is None:
            return raw_sentence

        prefix_without_group = prefix.replace(group.group(0), "").strip()
        if len(prefix_without_group) <= 3 or len(text) > 8:
            return text
        return raw_sentence

    output_sentence = raw_sentence
    for group_text in re.findall(group_pattern, raw_sentence):
        # Only strip a group if something substantial is left of the raw
        # sentence once that group is taken out.
        if len(raw_sentence.replace(group_text, "").strip()) > 5:
            output_sentence = output_sentence.replace(group_text, "").strip()

    # Never hand back a near-empty result; prefer the untouched sentence.
    if len(output_sentence) <= 5:
        output_sentence = raw_sentence
    return output_sentence
 def remove_temp_files(audio_file):
     os.remove("temp.txt")
     os.remove("data/test_case.tsv")
     examples=[['short-case.wav', "German"], ['long-case.wav', "German"]],
     title="ConST: an end-to-end speech translator",
     description='ConST is an end-to-end speech-to-text translation model, whose algorithm corresponds to the '
+                'NAACL 2022 paper *"Cross-modal Contrastive Learning for Speech Translation"* (see the paper at https://arxiv.org/abs/2205.02444 for more details). '
+                'This is a live demo for ConST, to translate English into eight European languages. \n'
+                'p.s. For better experience, we recommend using **Chrome** to record audio.',
     article="- The motivation of the ConST model is to use the contrastive learning method to learn similar representations for semantically similar speech and text, " \
             "thus leveraging MT to help improve ST performance. \n"
             "- The models you are experiencing are trained based on the MuST-C dataset (https://ict.fbk.eu/must-c/), " \