speech-to-speech

Paused

App Files Community

zongxiao committed on Oct 12, 2023

Commit

cca84a3

•

1 Parent(s): 4df22b0

Update app.py

Browse files

Files changed (1) hide show

app.py +106 -103

app.py CHANGED Viewed

@@ -1,87 +1,3 @@
-# import torch
-# import numpy as np
-# import soundfile as sf
-# from transformers import pipeline
-# from transformers import BarkModel
-# from transformers import AutoProcessor
-# device = "cuda:0" if torch.cuda.is_available() else "cpu"
-# pipe = pipeline(
-#     "automatic-speech-recognition", model="openai/whisper-large-v2", device=device
-# )
-# label = pipeline("audio-classification", model="facebook/mms-lid-126", device=device)
-# processor = AutoProcessor.from_pretrained("suno/bark")
-# model = BarkModel.from_pretrained("suno/bark")
-# model = model.to(device)
-# synthesised_rate = model.generation_config.sample_rate
-# def translate(audio_file):
-#     audio, sampling_rate = sf.read(audio_file)
-#     outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe","language":"chinese"})
-#     language_prediction = label({"array": audio, "sampling_rate": sampling_rate})
-#     label_outputs = {}
-#     for pred in language_prediction:
-#         label_outputs[pred["label"]] = pred["score"]
-#     return outputs["text"],label_outputs
-# def synthesise(text_prompt,voice_preset="v2/zh_speaker_1"):
-#     inputs = processor(text_prompt, voice_preset=voice_preset)
-#     speech_output = model.generate(**inputs.to(device),pad_token_id=10000)
-#     return speech_output
-# def speech_to_speech_translation(audio,voice_preset="v2/zh_speaker_1"):
-#     translated_text, label_outputs= translate(audio)
-#     synthesised_speech = synthesise(translated_text,voice_preset)
-#     synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
-#     return (synthesised_rate , synthesised_speech.T),translated_text,label_outputs
-# title = "外国话转中文话"
-# description = """
-# 作为[Hugging Face Audio course](https://huggingface.co/learn/audio-course/chapter0/introduction) 的结课大作业，本演示调用了三个自然语言处理的大模型，一个用于将外国话翻译成中文，一个用于判断说的哪个国家的话，一个用于将中文转成语音输出。演示同时支持语音上传和麦克风输入，转换速度比较慢因为租不起GPU的服务器（支出增加20倍），建议您通过已经缓存Examples体验效果。欢迎添加我的微信号：ESGGTP 与我的平行人交流。
-# ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
-# """
-# examples = [
-#     ["./en.mp3", None],
-#     ["./de.mp3", None],
-#     ["./fr.mp3", None],
-#     ["./it.mp3", None],
-# ]
-# import gradio as gr
-# demo = gr.Blocks()
-# file_transcribe = gr.Interface(
-#     fn=speech_to_speech_translation,
-#     inputs=gr.Audio(source="upload", type="filepath"),
-#     outputs=[
-#         gr.Audio(label="Generated Speech", type="numpy"),
-#         gr.Text(label="Transcription"),
-#         gr.Label(label="Language prediction"),
-#     ],
-#     title=title,
-#     description=description,
-#     examples=examples,
-# )
-# mic_transcribe = gr.Interface(
-#     fn=speech_to_speech_translation,
-#     inputs=gr.Audio(source="microphone", type="filepath"),
-#     outputs=[
-#         gr.Audio(label="Generated Speech", type="numpy"),
-#         gr.Text(label="Transcription"),
-#         gr.Label(label="Language prediction"),
-#     ],
-#     title=title,
-#     description=description,
-# )
-# with demo:
-#     gr.TabbedInterface(
-#         [file_transcribe, mic_transcribe],
-#         ["Transcribe Audio File", "Transcribe Microphone"],
-#     )
-# demo.launch(share=True)
-###########################################################################################################################
 import torch
 import numpy as np
 import soundfile as sf
@@ -94,47 +10,46 @@ device = "cuda:0" if torch.cuda.is_available() else "cpu"
 pipe = pipeline(
     "automatic-speech-recognition", model="openai/whisper-large-v2", device=device
 )
-#label = pipeline("audio-classification", model="facebook/mms-lid-126", device=device)
 processor = AutoProcessor.from_pretrained("suno/bark")
 model = BarkModel.from_pretrained("suno/bark")
 model = model.to(device)
 synthesised_rate = model.generation_config.sample_rate
 def translate(audio_file):
-#    audio, sampling_rate = sf.read(audio_file)
-    outputs = pipe(audio_file, max_new_tokens=256, generate_kwargs={"task": "transcribe","language":"chinese"})
-#    language_prediction = label({"array": audio, "sampling_rate": sampling_rate})
-#    label_outputs = {}
-#    for pred in language_prediction:
-#        label_outputs[pred["label"]] = pred["score"]
-    return outputs["text"]#,label_outputs
 def synthesise(text_prompt,voice_preset="v2/zh_speaker_1"):
     inputs = processor(text_prompt, voice_preset=voice_preset)
     speech_output = model.generate(**inputs.to(device),pad_token_id=10000)
     return speech_output
 def speech_to_speech_translation(audio,voice_preset="v2/zh_speaker_1"):
-    #translated_text, label_outputs= translate(audio)
-    translated_text = translate(audio)
     synthesised_speech = synthesise(translated_text,voice_preset)
     synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
-    return (synthesised_rate , synthesised_speech.T),translated_text#,label_outputs
 title = "外国话转中文话"
 description = """
-作为[Hugging Face Audio course](https://github.com/danfouer/HFAudioCourse) 的结课大作业，本演示调用了三个自然语言处理的大模型，一个用于将外国话翻译成中文，一个用于判断说的哪个国家的话（CPU演示太慢暂时先去掉了），一个用于将中文转成语音输出。演示同时支持语音上传和麦克风输入，转换速度比较慢因为租不起GPU的服务器（支出增加20倍），建议您通过已经缓存Examples体验效果。欢迎添加我的微信号：ESGGTP 与我的平行人交流。
 ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
 """
 examples = [
-    ["./en.mp3", None],
-    ["./de.mp3", None],
     ["./fr.mp3", None],
     ["./it.mp3", None],
     ["./nl.mp3", None],
     ["./fi.mp3", None],
-    ["./cs.mp3", None],
-    ["./pl.mp3", None],
 ]
 import gradio as gr
@@ -145,7 +60,7 @@ file_transcribe = gr.Interface(
     outputs=[
         gr.Audio(label="Generated Speech", type="numpy"),
         gr.Text(label="Transcription"),
-#        gr.Label(label="Language prediction"),
     ],
     title=title,
     description=description,
@@ -157,7 +72,7 @@ mic_transcribe = gr.Interface(
     outputs=[
         gr.Audio(label="Generated Speech", type="numpy"),
         gr.Text(label="Transcription"),
-#        gr.Label(label="Language prediction"),
     ],
     title=title,
     description=description,
@@ -168,4 +83,92 @@ with demo:
         ["Transcribe Audio File", "Transcribe Microphone"],
     )
-demo.launch(share=True)

 import torch
 import numpy as np
 import soundfile as sf
 pipe = pipeline(
     "automatic-speech-recognition", model="openai/whisper-large-v2", device=device
 )
+label = pipeline("audio-classification", model="facebook/mms-lid-126", device=device)
 processor = AutoProcessor.from_pretrained("suno/bark")
 model = BarkModel.from_pretrained("suno/bark")
 model = model.to(device)
 synthesised_rate = model.generation_config.sample_rate
 def translate(audio_file):
+    audio, sampling_rate = sf.read(audio_file)
+    outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe","language":"chinese"})
+    language_prediction = label({"array": audio, "sampling_rate": sampling_rate})
+    label_outputs = {}
+    for pred in language_prediction:
+        label_outputs[pred["label"]] = pred["score"]
+    return outputs["text"],label_outputs
 def synthesise(text_prompt,voice_preset="v2/zh_speaker_1"):
     inputs = processor(text_prompt, voice_preset=voice_preset)
     speech_output = model.generate(**inputs.to(device),pad_token_id=10000)
     return speech_output
 def speech_to_speech_translation(audio,voice_preset="v2/zh_speaker_1"):
+    translated_text, label_outputs= translate(audio)
     synthesised_speech = synthesise(translated_text,voice_preset)
     synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
+    return (synthesised_rate , synthesised_speech.T),translated_text,label_outputs
 title = "外国话转中文话"
 description = """
+作为[Hugging Face Audio course](https://github.com/danfouer/HFAudioCourse) 的结课大作业，本演示调用了三个自然语言处理的大模型，一个用于将外国话翻译成中文，一个用于判断说的哪个国家的话，一个用于将中文转成语音输出。演示同时支持语音上传和麦克风输入，转换速度比较慢因为租不起GPU的服务器（支出增加20倍），建议您通过已经缓存Examples体验效果。欢迎添加我的微信号：ESGGTP 与我的平行人交流。
 ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
 """
 examples = [
+    # ["./en.mp3", None],
+    # ["./de.mp3", None],
     ["./fr.mp3", None],
     ["./it.mp3", None],
     ["./nl.mp3", None],
     ["./fi.mp3", None],
+    # ["./cs.mp3", None],
+    # ["./pl.mp3", None],
 ]
 import gradio as gr
     outputs=[
         gr.Audio(label="Generated Speech", type="numpy"),
         gr.Text(label="Transcription"),
+        gr.Label(label="Language prediction"),
     ],
     title=title,
     description=description,
     outputs=[
         gr.Audio(label="Generated Speech", type="numpy"),
         gr.Text(label="Transcription"),
+        gr.Label(label="Language prediction"),
     ],
     title=title,
     description=description,
         ["Transcribe Audio File", "Transcribe Microphone"],
     )
+demo.launch()
+###########################################################################################################################
+# import torch
+# import numpy as np
+# import soundfile as sf
+# from transformers import pipeline
+# from transformers import BarkModel
+# from transformers import AutoProcessor
+# device = "cuda:0" if torch.cuda.is_available() else "cpu"
+# pipe = pipeline(
+#     "automatic-speech-recognition", model="openai/whisper-large-v2", device=device
+# )
+# #label = pipeline("audio-classification", model="facebook/mms-lid-126", device=device)
+# processor = AutoProcessor.from_pretrained("suno/bark")
+# model = BarkModel.from_pretrained("suno/bark")
+# model = model.to(device)
+# synthesised_rate = model.generation_config.sample_rate
+# def translate(audio_file):
+# #    audio, sampling_rate = sf.read(audio_file)
+#     outputs = pipe(audio_file, max_new_tokens=256, generate_kwargs={"task": "transcribe","language":"chinese"})
+# #    language_prediction = label({"array": audio, "sampling_rate": sampling_rate})
+# #    label_outputs = {}
+# #    for pred in language_prediction:
+# #        label_outputs[pred["label"]] = pred["score"]
+#     return outputs["text"]#,label_outputs
+# def synthesise(text_prompt,voice_preset="v2/zh_speaker_1"):
+#     inputs = processor(text_prompt, voice_preset=voice_preset)
+#     speech_output = model.generate(**inputs.to(device),pad_token_id=10000)
+#     return speech_output
+# def speech_to_speech_translation(audio,voice_preset="v2/zh_speaker_1"):
+#     #translated_text, label_outputs= translate(audio)
+#     translated_text = translate(audio)
+#     synthesised_speech = synthesise(translated_text,voice_preset)
+#     synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
+#     return (synthesised_rate , synthesised_speech.T),translated_text#,label_outputs
+# title = "外国话转中文话"
+# description = """
+# 作为[Hugging Face Audio course](https://github.com/danfouer/HFAudioCourse) 的结课大作业，本演示调用了三个自然语言处理的大模型，一个用于将外国话翻译成中文，一个用于判断说的哪个国家的话（CPU演示太慢暂时先去掉了），一个用于将中文转成语音输出。演示同时支持语音上传和麦克风输入，转换速度比较慢因为租不起GPU的服务器（支出增加20倍），建议您通过已经缓存Examples体验效果。欢迎添加我的微信号：ESGGTP 与我的平行人交流。
+# ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
+# """
+# examples = [
+#     ["./en.mp3", None],
+#     ["./de.mp3", None],
+#     ["./fr.mp3", None],
+#     ["./it.mp3", None],
+#     ["./nl.mp3", None],
+#     ["./fi.mp3", None],
+#     ["./cs.mp3", None],
+#     ["./pl.mp3", None],
+# ]
+# import gradio as gr
+# demo = gr.Blocks()
+# file_transcribe = gr.Interface(
+#     fn=speech_to_speech_translation,
+#     inputs=gr.Audio(source="upload", type="filepath"),
+#     outputs=[
+#         gr.Audio(label="Generated Speech", type="numpy"),
+#         gr.Text(label="Transcription"),
+# #        gr.Label(label="Language prediction"),
+#     ],
+#     title=title,
+#     description=description,
+#     examples=examples,
+# )
+# mic_transcribe = gr.Interface(
+#     fn=speech_to_speech_translation,
+#     inputs=gr.Audio(source="microphone", type="filepath"),
+#     outputs=[
+#         gr.Audio(label="Generated Speech", type="numpy"),
+#         gr.Text(label="Transcription"),
+# #        gr.Label(label="Language prediction"),
+#     ],
+#     title=title,
+#     description=description,
+# )
+# with demo:
+#     gr.TabbedInterface(
+#         [file_transcribe, mic_transcribe],
+#         ["Transcribe Audio File", "Transcribe Microphone"],
+#     )
+# demo.launch()