maskgct

Runtime error

App Files Files Community

Hecheng0625 commited on Oct 23

Commit

36f9ba6

•

1 Parent(s): fc42837

Update app.py

Browse files

Files changed (1) hide show

app.py +39 -37

app.py CHANGED Viewed

@@ -19,23 +19,41 @@ from models.tts.maskgct.g2p.g2p_generation import g2p, chn_eng_g2p
 from transformers import SeamlessM4TFeatureExtractor
-# import whisperx
 processor = SeamlessM4TFeatureExtractor.from_pretrained("facebook/w2v-bert-2.0")
 device = torch.device("cuda" if torch.cuda.is_available() else "CPU")
-# whisper_model = whisperx.load_model("small", "cuda", compute_type="int8")
-# @torch.no_grad()
-# def get_prompt_text(speech_16k):
-#     asr_result = whisper_model.transcribe(speech_16k)
-#     print("asr_result:", asr_result)
-#     language = asr_result["language"]
-#     #text = asr_result["text"] # whisper asr result
-#     text = asr_result["segments"][0]["text"]
-#     print("prompt text:", text)
-#     return text, language
 def g2p_(text, language):
@@ -279,10 +297,7 @@ def load_models():
 @torch.no_grad()
 def maskgct_inference(
     prompt_speech_path,
-    prompt_text,
     target_text,
-    language="en",
-    target_language="en",
     target_len=None,
     n_timesteps=25,
     cfg=2.5,
@@ -295,13 +310,17 @@ def maskgct_inference(
     speech_16k = librosa.load(prompt_speech_path, sr=16000)[0]
     speech = librosa.load(prompt_speech_path, sr=24000)[0]
-    # if prompt_text is None:
-    #     prompt_text, language = get_prompt_text(prompt_speech_path)
     combine_semantic_code, _ = text2semantic(
         device,
         speech_16k,
-        prompt_text,
         language,
         target_text,
         target_language,
@@ -326,21 +345,15 @@ def maskgct_inference(
 @spaces.GPU
 def inference(
     prompt_wav,
-    prompt_text,
     target_text,
     target_len,
     n_timesteps,
-    language,
-    target_language,
 ):
     save_path = "./output/output.wav"
     os.makedirs("./output", exist_ok=True)
     recovered_audio = maskgct_inference(
         prompt_wav,
-        prompt_text,
         target_text,
-        language,
-        target_language,
         target_len=target_len,
         n_timesteps=int(n_timesteps),
         device=device,
@@ -369,7 +382,6 @@ iface = gr.Interface(
     fn=inference,
     inputs=[
         gr.Audio(label="Upload Prompt Wav", type="filepath"),
-        gr.Textbox(label="Prompt Text"),
         gr.Textbox(label="Target Text"),
         gr.Number(
             label="Target Duration (in seconds), if the target duration is less than 0, the system will estimate a duration.", value=-1
@@ -377,23 +389,13 @@ iface = gr.Interface(
         gr.Slider(
             label="Number of Timesteps", minimum=15, maximum=100, value=25, step=1
         ),
-        gr.Dropdown(label="Language", choices=language_list, value="en"),
-        gr.Dropdown(label="Target Language", choices=language_list, value="en"),
     ],
     outputs=gr.Audio(label="Generated Audio"),
     title="MaskGCT TTS Demo",
     description="""
-    ## MaskGCT: Zero-Shot Text-to-Speech with Masked Generative Codec Transformer
-    [![arXiv](https://img.shields.io/badge/arXiv-Paper-COLOR.svg)](https://arxiv.org/abs/2409.00750)
-    [![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-model-yellow)](https://huggingface.co/amphion/maskgct)
-    [![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-demo-pink)](https://huggingface.co/spaces/amphion/maskgct)
-    [![readme](https://img.shields.io/badge/README-Key%20Features-blue)](https://github.com/open-mmlab/Amphion/tree/main/models/tts/maskgct)
     """
 )
 # Launch the interface
-iface.launch(allowed_paths=["./output"])

 from transformers import SeamlessM4TFeatureExtractor
+import whisper
+import langid
 processor = SeamlessM4TFeatureExtractor.from_pretrained("facebook/w2v-bert-2.0")
 device = torch.device("cuda" if torch.cuda.is_available() else "CPU")
+whisper_model = whisper.load_model("turbo")
+def detect_speech_language(speech_16k):
+    return whisper_model.detect_language(speech_16k)
+def detect_text_language(text):
+    langid.classify(text)[0]
+@torch.no_grad()
+def get_prompt_text(speech_16k, language):
+    full_prompt_text = ""
+    shot_prompt_text = ""
+    short_prompt_end_ts = 0.0
+    asr_result = whisper_model.transcribe(speech_16k, language=language)
+    print("asr_result:", asr_result)
+    full_prompt_text = asr_result["text"] # whisper asr result
+    #text = asr_result["segments"][0]["text"] # whisperx asr result
+    shot_prompt_text = ""
+    short_prompt_end_ts = 0.0
+    for segment in asr_result["segments"]:
+        shot_prompt_text = shot_prompt_text +  segment['text']
+        short_prompt_end_ts = segment['end']
+        if short_prompt_end_ts >= 4:
+            break
+    print("full prompt text:", full_prompt_text, " shot_prompt_text:", shot_prompt_text,
+          "short_prompt_end_ts:", short_prompt_end_ts)
+    return full_prompt_text, shot_prompt_text, short_prompt_end_ts
 def g2p_(text, language):
 @torch.no_grad()
 def maskgct_inference(
     prompt_speech_path,
     target_text,
     target_len=None,
     n_timesteps=25,
     cfg=2.5,
     speech_16k = librosa.load(prompt_speech_path, sr=16000)[0]
     speech = librosa.load(prompt_speech_path, sr=24000)[0]
+    prompt_language = detect_speech_language(speech_16k)
+    full_prompt_text, short_prompt_text, shot_prompt_end_ts = get_prompt_text(prompt_speech_path,
+                                                                              prompt_language)
+    # use the first 4+ seconds wav as the prompt in case the prompt wav is too long
+    speech = speech[0: int(shot_prompt_end_ts * 24000)]
+    speech_16k = speech_16k[0: int(shot_prompt_end_ts*16000)]
+    target_language = detect_text_language(target_text)
     combine_semantic_code, _ = text2semantic(
         device,
         speech_16k,
+        short_prompt_text,
         language,
         target_text,
         target_language,
 @spaces.GPU
 def inference(
     prompt_wav,
     target_text,
     target_len,
     n_timesteps,
 ):
     save_path = "./output/output.wav"
     os.makedirs("./output", exist_ok=True)
     recovered_audio = maskgct_inference(
         prompt_wav,
         target_text,
         target_len=target_len,
         n_timesteps=int(n_timesteps),
         device=device,
     fn=inference,
     inputs=[
         gr.Audio(label="Upload Prompt Wav", type="filepath"),
         gr.Textbox(label="Target Text"),
         gr.Number(
             label="Target Duration (in seconds), if the target duration is less than 0, the system will estimate a duration.", value=-1
         gr.Slider(
             label="Number of Timesteps", minimum=15, maximum=100, value=25, step=1
         ),
     ],
     outputs=gr.Audio(label="Generated Audio"),
     title="MaskGCT TTS Demo",
     description="""
+    [![arXiv](https://img.shields.io/badge/arXiv-Paper-COLOR.svg)](https://arxiv.org/abs/2409.00750) [![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-model-yellow)](https://huggingface.co/amphion/maskgct) [![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-demo-pink)](https://huggingface.co/spaces/amphion/maskgct) [![readme](https://img.shields.io/badge/README-Key%20Features-blue)](https://github.com/open-mmlab/Amphion/tree/main/models/tts/maskgct)
     """
 )
 # Launch the interface
+iface.launch(allowed_paths=["./output"])