Spaces:

Chillarmo
/

Voice_Cloning_with_OuteTTS

Running

App Files Community

Chillarmo committed on Nov 5, 2024

Commit

2d29569

verified ·

1 Parent(s): 21852ff

Update app.py

Browse files

Files changed (1) hide show

app.py +47 -21

app.py CHANGED Viewed

@@ -25,13 +25,13 @@ def transcribe_audio(audio_path):
     try:
         # Transcribe with minimal settings for speed
         segments, _ = ASR_MODEL.transcribe(audio_path,
-                                         beam_size=1,           # Reduce beam size
-                                         best_of=1,             # Don't generate alternatives
-                                         temperature=1.0,       # No temperature sampling
-                                         condition_on_previous_text=False,  # Don't condition on previous
-                                         compression_ratio_threshold=2.4,   # Less strict threshold
-                                         log_prob_threshold=-1.0,          # Less strict threshold
-                                         no_speech_threshold=0.6)          # Less strict threshold
         # Combine all segments
         text = " ".join([segment.text for segment in segments]).strip()
@@ -44,29 +44,34 @@ def process_audio_file(audio_path, reference_text, text_to_speak, temperature=0.
     try:
         # If no reference text provided, transcribe the audio
         if not reference_text.strip():
             reference_text = transcribe_audio(audio_path)
             if reference_text.startswith("Error"):
                 return None, reference_text
         # Create speaker from reference audio
         speaker = TTS_INTERFACE.create_speaker(
             audio_path,
-            reference_text
         )
         # Generate speech with cloned voice
         output = TTS_INTERFACE.generate(
-            text=text_to_speak,
             speaker=speaker,
             temperature=temperature,
             repetition_penalty=repetition_penalty,
-            max_lenght=4096
         )
         # Save to temporary file and return path
         temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
         output.save(temp_file.name)
-        return temp_file.name, f"Voice cloning successful!\nReference text used: {reference_text}"
     except Exception as e:
         return None, f"Error: {str(e)}"
@@ -78,20 +83,29 @@ with gr.Blocks(title="Voice Cloning with OuteTTS") as demo:
     This app uses OuteTTS to clone voices. Upload a reference audio file, provide the text being spoken in that audio (or leave blank for automatic transcription),
     and enter the new text you want to be spoken in the cloned voice.
-    Note: For best results, use clear audio with minimal background noise.
     """)
     with gr.Row():
         with gr.Column():
             # Input components
             audio_input = gr.Audio(label="Upload Reference Audio", type="filepath")
             reference_text = gr.Textbox(
                 label="Reference Text (what is being said in the audio, leave blank for auto-transcription)",
-                placeholder="Leave empty to auto-transcribe or enter the exact text from the reference audio"
             )
             text_to_speak = gr.Textbox(
-                label="Text to Speak (what you want the cloned voice to say)",
-                placeholder="Enter the text you want the cloned voice to speak"
             )
             with gr.Row():
@@ -101,14 +115,26 @@ with gr.Blocks(title="Voice Cloning with OuteTTS") as demo:
                                              label="Repetition Penalty")
             # Submit button
-            submit_btn = gr.Button("Generate Voice", variant="primary")
         with gr.Column():
             # Output components
             output_audio = gr.Audio(label="Generated Speech")
-            output_message = gr.Textbox(label="Status", max_lines=3)
-    # Handle submission
     submit_btn.click(
         fn=process_audio_file,
         inputs=[audio_input, reference_text, text_to_speak, temperature, repetition_penalty],
@@ -118,9 +144,9 @@ with gr.Blocks(title="Voice Cloning with OuteTTS") as demo:
     gr.Markdown("""
     ### Tips for best results:
     1. Use high-quality reference audio (clear speech, minimal background noise)
-    2. If providing reference text manually, ensure it matches the audio exactly
-    3. If using auto-transcription, verify the transcribed text in the status message
-    4. Keep generated text relatively short for better quality
     5. Adjust temperature and repetition penalty if needed:
        - Lower temperature (0.1-0.3) for more consistent output
        - Higher repetition penalty (1.1-1.3) to avoid repetition

     try:
         # Transcribe with minimal settings for speed
         segments, _ = ASR_MODEL.transcribe(audio_path,
+                                         beam_size=1,
+                                         best_of=1,
+                                         temperature=1.0,
+                                         condition_on_previous_text=False,
+                                         compression_ratio_threshold=2.4,
+                                         log_prob_threshold=-1.0,
+                                         no_speech_threshold=0.6)
         # Combine all segments
         text = " ".join([segment.text for segment in segments]).strip()
     try:
         # If no reference text provided, transcribe the audio
         if not reference_text.strip():
+            gr.Info("Transcribing audio...")
             reference_text = transcribe_audio(audio_path)
             if reference_text.startswith("Error"):
                 return None, reference_text
+        gr.Info(f"Using reference text: {reference_text}")
         # Create speaker from reference audio
         speaker = TTS_INTERFACE.create_speaker(
             audio_path,
+            reference_text[:4000]  # Limit reference text length
         )
         # Generate speech with cloned voice
         output = TTS_INTERFACE.generate(
+            text=text_to_speak[:500],  # Limit output text length
             speaker=speaker,
             temperature=temperature,
             repetition_penalty=repetition_penalty,
+            max_lenght=2048  # Reduced from 4096 to avoid errors
         )
         # Save to temporary file and return path
         temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
         output.save(temp_file.name)
+        return temp_file.name, f"""Processing complete!
+Reference text: {reference_text[:500]}...
+(Showing first 500 characters of reference text)"""
     except Exception as e:
         return None, f"Error: {str(e)}"
     This app uses OuteTTS to clone voices. Upload a reference audio file, provide the text being spoken in that audio (or leave blank for automatic transcription),
     and enter the new text you want to be spoken in the cloned voice.
+    Note:
+    - For best results, use clear audio with minimal background noise
+    - Reference text is limited to 4000 characters
+    - Output text is limited to 500 characters
     """)
     with gr.Row():
         with gr.Column():
             # Input components
             audio_input = gr.Audio(label="Upload Reference Audio", type="filepath")
+            with gr.Row():
+                transcribe_btn = gr.Button("📝 Transcribe Audio", variant="secondary")
             reference_text = gr.Textbox(
                 label="Reference Text (what is being said in the audio, leave blank for auto-transcription)",
+                placeholder="Click 'Transcribe Audio' or enter the exact text from the reference audio",
+                lines=3
             )
             text_to_speak = gr.Textbox(
+                label="Text to Speak (what you want the cloned voice to say, max 500 characters)",
+                placeholder="Enter the text you want the cloned voice to speak",
+                lines=3,
+                max_lines=5
             )
             with gr.Row():
                                              label="Repetition Penalty")
             # Submit button
+            submit_btn = gr.Button("🎙️ Generate Voice", variant="primary")
         with gr.Column():
             # Output components
             output_audio = gr.Audio(label="Generated Speech")
+            output_message = gr.Textbox(label="Status", lines=4)
+    # Handle transcription button
+    def transcribe_button(audio):
+        if not audio:
+            return "Please upload audio first."
+        return transcribe_audio(audio)
+    transcribe_btn.click(
+        fn=transcribe_button,
+        inputs=[audio_input],
+        outputs=[reference_text],
+    )
+    # Handle main generation
     submit_btn.click(
         fn=process_audio_file,
         inputs=[audio_input, reference_text, text_to_speak, temperature, repetition_penalty],
     gr.Markdown("""
     ### Tips for best results:
     1. Use high-quality reference audio (clear speech, minimal background noise)
+    2. Try to keep reference audio under 30 seconds
+    3. If auto-transcription isn't accurate, you can manually correct the text
+    4. Keep generated text short for better quality
     5. Adjust temperature and repetition penalty if needed:
        - Lower temperature (0.1-0.3) for more consistent output
        - Higher repetition penalty (1.1-1.3) to avoid repetition