Spaces:

CAMB-AI
/

mars5_space

App Files Files Community

arnavmehta7 committed on Jun 17

Commit

b0c547a

•

1 Parent(s): 26fdaed

Update app.py

Browse files

Files changed (1) hide show

app.py +78 -21

app.py CHANGED Viewed

@@ -4,16 +4,14 @@ import torch
 import librosa
 from pathlib import Path
 import tempfile, torchaudio
-# from faster_whisper import WhisperModel
 from transformers import pipeline
 from uuid import uuid4
 # Load the MARS5 model
 mars5, config_class = torch.hub.load('Camb-ai/mars5-tts', 'mars5_english', trust_repo=True)
-# asr_model = WhisperModel("small", device="cpu", compute_type="int8")
 asr_model = pipeline(
     "automatic-speech-recognition",
-    model="openai/whisper-medium",
     chunk_length_s=30,
     device=torch.device("cuda:0"),
 )
@@ -24,15 +22,16 @@ def transcribe_file(f: str) -> str:
     return " ".join([prediction["text"] for prediction in predictions])
 # Function to process the text and audio input and generate the synthesized output
-def synthesize(text, audio_file, transcript):
-    audio_file = Path(audio_file)
-    temp_file = f"{uuid4()}.{audio_file.suffix}"
-    # copying the audio_file
-    with open(audio_file, 'rb') as src, open(temp_file, 'wb') as dst:
-        dst.write(src.read())
-    audio_file = temp_file
     print(f">>>>> synthesizing! audio_file: {audio_file}")
     if not transcript:
@@ -43,11 +42,10 @@ def synthesize(text, audio_file, transcript):
     wav = torch.from_numpy(wav)
     # Define the configuration for the TTS model
-    deep_clone = True
-    cfg = config_class(deep_clone=deep_clone, rep_penalty_window=100, top_k=100, temperature=0.7, freq_penalty=3)
     # Generate the synthesized audio
-    ar_codes, wav_out = mars5.tts(text, wav, transcript, cfg=cfg)
     # Save the synthesized audio to a temporary file
     output_path = Path(tempfile.mktemp(suffix=".wav"))
@@ -73,7 +71,7 @@ with gr.Blocks() as demo:
     text = gr.Textbox(label="Text to synthesize")
     audio_file = gr.Audio(label="Audio file to clone from", type="filepath")
-    generate_btn = gr.Button(label="Generate Synthesized Audio")
     with gr.Accordion("Advanced Settings", open=False):
         gr.Markdown("additional inference settings\nWARNING: changing these incorrectly may degrade quality.")
@@ -86,18 +84,77 @@ with gr.Blocks() as demo:
         presence_penalty = gr.Slider(minimum=0, maximum=5, step=0.05, label="presence_penalty", value=defaults['presence_penalty'])
         rep_penalty_window = gr.Slider(minimum=1, maximum=500, step=1, label="rep_penalty_window", value=defaults['rep_penalty_window'])
         nar_guidance_w = gr.Slider(minimum=1, maximum=8, step=0.1, label="nar_guidance_w", value=defaults['nar_guidance_w'])
-        meta_n = gr.Slider(minimum=1, maximum=10, step=1, label="meta_n", value=2, interactive=False)
         deep_clone = gr.Checkbox(value=defaults['deep_clone'], label='deep_clone')
-        dummy = gr.Number(label='Example number', visible=False)
     output = gr.Audio(label="Synthesized Audio", type="filepath")
-    def on_click(text, audio_file, prompt_text):
         print(f">>>> transcript: {prompt_text}; audio_file = {audio_file}")
-        of = synthesize(text, audio_file, prompt_text)
         print(f">>>> output file: {of}")
         return of
-    generate_btn.click(on_click, inputs=[text, audio_file, prompt_text], outputs=[output])
 demo.launch(share=False)

 import librosa
 from pathlib import Path
 import tempfile, torchaudio
 from transformers import pipeline
 from uuid import uuid4
 # Load the MARS5 model
 mars5, config_class = torch.hub.load('Camb-ai/mars5-tts', 'mars5_english', trust_repo=True)
 asr_model = pipeline(
     "automatic-speech-recognition",
+    model="openai/whisper-tiny",
     chunk_length_s=30,
     device=torch.device("cuda:0"),
 )
     return " ".join([prediction["text"] for prediction in predictions])
 # Function to process the text and audio input and generate the synthesized output
+def synthesize(text, audio_file, transcript, kwargs_dict):
+    print(f">>>>>>> Kwargs dict: {kwargs_dict}")
+    # audio_file = Path(audio_file)
+    # temp_file = f"{uuid4()}.{audio_file.suffix}"
+    # # copying the audio_file
+    # with open(audio_file, 'rb') as src, open(temp_file, 'wb') as dst:
+    #     dst.write(src.read())
+    # audio_file = temp_file
     print(f">>>>> synthesizing! audio_file: {audio_file}")
     if not transcript:
     wav = torch.from_numpy(wav)
     # Define the configuration for the TTS model
+    cfg = config_class(**kwargs_dict)
     # Generate the synthesized audio
+    ar_codes, wav_out = mars5.tts(text, wav, transcript.strip(), cfg=cfg)
     # Save the synthesized audio to a temporary file
     output_path = Path(tempfile.mktemp(suffix=".wav"))
     text = gr.Textbox(label="Text to synthesize")
     audio_file = gr.Audio(label="Audio file to clone from", type="filepath")
+    generate_btn = gr.Button("Generate Synthesized Audio")
     with gr.Accordion("Advanced Settings", open=False):
         gr.Markdown("additional inference settings\nWARNING: changing these incorrectly may degrade quality.")
         presence_penalty = gr.Slider(minimum=0, maximum=5, step=0.05, label="presence_penalty", value=defaults['presence_penalty'])
         rep_penalty_window = gr.Slider(minimum=1, maximum=500, step=1, label="rep_penalty_window", value=defaults['rep_penalty_window'])
         nar_guidance_w = gr.Slider(minimum=1, maximum=8, step=0.1, label="nar_guidance_w", value=defaults['nar_guidance_w'])
         deep_clone = gr.Checkbox(value=defaults['deep_clone'], label='deep_clone')
     output = gr.Audio(label="Synthesized Audio", type="filepath")
+    def on_click(
+        text,
+        audio_file,
+        prompt_text,
+        temperature,
+        top_k,
+        top_p,
+        typical_p,
+        freq_penalty,
+        presence_penalty,
+        rep_penalty_window,
+        nar_guidance_w,
+        deep_clone
+    ):
         print(f">>>> transcript: {prompt_text}; audio_file = {audio_file}")
+        of = synthesize(
+            text,
+            audio_file,
+            prompt_text,
+            {
+                'temperature': temperature,
+                'top_k': top_k,
+                'top_p': top_p,
+                'typical_p': typical_p,
+                'freq_penalty': freq_penalty,
+                'presence_penalty': presence_penalty,
+                'rep_penalty_window': rep_penalty_window,
+                'nar_guidance_w': nar_guidance_w,
+                'deep_clone': deep_clone
+            }
+        )
         print(f">>>> output file: {of}")
         return of
+    generate_btn.click(
+        on_click,
+        inputs=[
+            text,
+            audio_file,
+            prompt_text,
+            temperature,
+            top_k,
+            top_p,
+            typical_p,
+            freq_penalty,
+            presence_penalty,
+            rep_penalty_window,
+            nar_guidance_w,
+            deep_clone
+        ],
+        outputs=[output]
+    )
+    gr.Markdown("### Examples")
+    # Add examples
+    defaults = [0.8, -1, 0.2, 1.0, 2.6, 0.4, 100, 3, True]
+    examples = [
+        ["Today is a wonderful day!", "female_speaker_1.flac", "People look, but no one ever finds it.", *defaults],
+        ["You guys need to figure this out.", "male_speaker_1.flac", "Ask her to bring these things with her from the store.", *defaults]
+    ]
+    gr.Examples(
+        examples=examples,
+        inputs=[text, audio_file, prompt_text, temperature, top_k, top_p, typical_p, freq_penalty, presence_penalty, rep_penalty_window, nar_guidance_w, deep_clone],
+        outputs=[output],
+        cache_examples=False,
+        fn=on_click
+    )
 demo.launch(share=False)