Chillarmo committed on
Commit
2d29569
·
verified ·
1 Parent(s): 21852ff

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -21
app.py CHANGED
@@ -25,13 +25,13 @@ def transcribe_audio(audio_path):
25
  try:
26
  # Transcribe with minimal settings for speed
27
  segments, _ = ASR_MODEL.transcribe(audio_path,
28
- beam_size=1, # Reduce beam size
29
- best_of=1, # Don't generate alternatives
30
- temperature=1.0, # No temperature sampling
31
- condition_on_previous_text=False, # Don't condition on previous
32
- compression_ratio_threshold=2.4, # Less strict threshold
33
- log_prob_threshold=-1.0, # Less strict threshold
34
- no_speech_threshold=0.6) # Less strict threshold
35
 
36
  # Combine all segments
37
  text = " ".join([segment.text for segment in segments]).strip()
@@ -44,29 +44,34 @@ def process_audio_file(audio_path, reference_text, text_to_speak, temperature=0.
44
  try:
45
  # If no reference text provided, transcribe the audio
46
  if not reference_text.strip():
 
47
  reference_text = transcribe_audio(audio_path)
48
  if reference_text.startswith("Error"):
49
  return None, reference_text
50
 
 
 
51
  # Create speaker from reference audio
52
  speaker = TTS_INTERFACE.create_speaker(
53
  audio_path,
54
- reference_text
55
  )
56
 
57
  # Generate speech with cloned voice
58
  output = TTS_INTERFACE.generate(
59
- text=text_to_speak,
60
  speaker=speaker,
61
  temperature=temperature,
62
  repetition_penalty=repetition_penalty,
63
- max_lenght=4096
64
  )
65
 
66
  # Save to temporary file and return path
67
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
68
  output.save(temp_file.name)
69
- return temp_file.name, f"Voice cloning successful!\nReference text used: {reference_text}"
 
 
70
 
71
  except Exception as e:
72
  return None, f"Error: {str(e)}"
@@ -78,20 +83,29 @@ with gr.Blocks(title="Voice Cloning with OuteTTS") as demo:
78
  This app uses OuteTTS to clone voices. Upload a reference audio file, provide the text being spoken in that audio (or leave blank for automatic transcription),
79
  and enter the new text you want to be spoken in the cloned voice.
80
 
81
- Note: For best results, use clear audio with minimal background noise.
 
 
 
82
  """)
83
 
84
  with gr.Row():
85
  with gr.Column():
86
  # Input components
87
  audio_input = gr.Audio(label="Upload Reference Audio", type="filepath")
 
 
 
88
  reference_text = gr.Textbox(
89
  label="Reference Text (what is being said in the audio, leave blank for auto-transcription)",
90
- placeholder="Leave empty to auto-transcribe or enter the exact text from the reference audio"
 
91
  )
92
  text_to_speak = gr.Textbox(
93
- label="Text to Speak (what you want the cloned voice to say)",
94
- placeholder="Enter the text you want the cloned voice to speak"
 
 
95
  )
96
 
97
  with gr.Row():
@@ -101,14 +115,26 @@ with gr.Blocks(title="Voice Cloning with OuteTTS") as demo:
101
  label="Repetition Penalty")
102
 
103
  # Submit button
104
- submit_btn = gr.Button("Generate Voice", variant="primary")
105
 
106
  with gr.Column():
107
  # Output components
108
  output_audio = gr.Audio(label="Generated Speech")
109
- output_message = gr.Textbox(label="Status", max_lines=3)
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
- # Handle submission
112
  submit_btn.click(
113
  fn=process_audio_file,
114
  inputs=[audio_input, reference_text, text_to_speak, temperature, repetition_penalty],
@@ -118,9 +144,9 @@ with gr.Blocks(title="Voice Cloning with OuteTTS") as demo:
118
  gr.Markdown("""
119
  ### Tips for best results:
120
  1. Use high-quality reference audio (clear speech, minimal background noise)
121
- 2. If providing reference text manually, ensure it matches the audio exactly
122
- 3. If using auto-transcription, verify the transcribed text in the status message
123
- 4. Keep generated text relatively short for better quality
124
  5. Adjust temperature and repetition penalty if needed:
125
  - Lower temperature (0.1-0.3) for more consistent output
126
  - Higher repetition penalty (1.1-1.3) to avoid repetition
 
25
  try:
26
  # Transcribe with minimal settings for speed
27
  segments, _ = ASR_MODEL.transcribe(audio_path,
28
+ beam_size=1,
29
+ best_of=1,
30
+ temperature=1.0,
31
+ condition_on_previous_text=False,
32
+ compression_ratio_threshold=2.4,
33
+ log_prob_threshold=-1.0,
34
+ no_speech_threshold=0.6)
35
 
36
  # Combine all segments
37
  text = " ".join([segment.text for segment in segments]).strip()
 
44
  try:
45
  # If no reference text provided, transcribe the audio
46
  if not reference_text.strip():
47
+ gr.Info("Transcribing audio...")
48
  reference_text = transcribe_audio(audio_path)
49
  if reference_text.startswith("Error"):
50
  return None, reference_text
51
 
52
+ gr.Info(f"Using reference text: {reference_text}")
53
+
54
  # Create speaker from reference audio
55
  speaker = TTS_INTERFACE.create_speaker(
56
  audio_path,
57
+ reference_text[:4000] # Limit reference text length
58
  )
59
 
60
  # Generate speech with cloned voice
61
  output = TTS_INTERFACE.generate(
62
+ text=text_to_speak[:500], # Limit output text length
63
  speaker=speaker,
64
  temperature=temperature,
65
  repetition_penalty=repetition_penalty,
66
+ max_lenght=2048 # Reduced from 4096 to avoid errors
67
  )
68
 
69
  # Save to temporary file and return path
70
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
71
  output.save(temp_file.name)
72
+ return temp_file.name, f"""Processing complete!
73
+ Reference text: {reference_text[:500]}...
74
+ (Showing first 500 characters of reference text)"""
75
 
76
  except Exception as e:
77
  return None, f"Error: {str(e)}"
 
83
  This app uses OuteTTS to clone voices. Upload a reference audio file, provide the text being spoken in that audio (or leave blank for automatic transcription),
84
  and enter the new text you want to be spoken in the cloned voice.
85
 
86
+ Note:
87
+ - For best results, use clear audio with minimal background noise
88
+ - Reference text is limited to 4000 characters
89
+ - Output text is limited to 500 characters
90
  """)
91
 
92
  with gr.Row():
93
  with gr.Column():
94
  # Input components
95
  audio_input = gr.Audio(label="Upload Reference Audio", type="filepath")
96
+ with gr.Row():
97
+ transcribe_btn = gr.Button("📝 Transcribe Audio", variant="secondary")
98
+
99
  reference_text = gr.Textbox(
100
  label="Reference Text (what is being said in the audio, leave blank for auto-transcription)",
101
+ placeholder="Click 'Transcribe Audio' or enter the exact text from the reference audio",
102
+ lines=3
103
  )
104
  text_to_speak = gr.Textbox(
105
+ label="Text to Speak (what you want the cloned voice to say, max 500 characters)",
106
+ placeholder="Enter the text you want the cloned voice to speak",
107
+ lines=3,
108
+ max_lines=5
109
  )
110
 
111
  with gr.Row():
 
115
  label="Repetition Penalty")
116
 
117
  # Submit button
118
+ submit_btn = gr.Button("🎙️ Generate Voice", variant="primary")
119
 
120
  with gr.Column():
121
  # Output components
122
  output_audio = gr.Audio(label="Generated Speech")
123
+ output_message = gr.Textbox(label="Status", lines=4)
124
+
125
+ # Handle transcription button
126
+ def transcribe_button(audio):
127
+ if not audio:
128
+ return "Please upload audio first."
129
+ return transcribe_audio(audio)
130
+
131
+ transcribe_btn.click(
132
+ fn=transcribe_button,
133
+ inputs=[audio_input],
134
+ outputs=[reference_text],
135
+ )
136
 
137
+ # Handle main generation
138
  submit_btn.click(
139
  fn=process_audio_file,
140
  inputs=[audio_input, reference_text, text_to_speak, temperature, repetition_penalty],
 
144
  gr.Markdown("""
145
  ### Tips for best results:
146
  1. Use high-quality reference audio (clear speech, minimal background noise)
147
+ 2. Try to keep reference audio under 30 seconds
148
+ 3. If auto-transcription isn't accurate, you can manually correct the text
149
+ 4. Keep generated text short for better quality
150
  5. Adjust temperature and repetition penalty if needed:
151
  - Lower temperature (0.1-0.3) for more consistent output
152
  - Higher repetition penalty (1.1-1.3) to avoid repetition