Spaces:
Runtime error
Runtime error
ver 1.9.9
Browse files
app.py
CHANGED
@@ -173,14 +173,18 @@ def time_to_seconds(time_str):
|
|
173 |
return seconds
|
174 |
|
175 |
def closest_speedup_factor(factor, allowed_factors):
    """Return the entry of ``allowed_factors`` that is nearest to ``factor``.

    Nearness is the absolute difference between a candidate and ``factor``.
    When several candidates are equally near, the one that appears first in
    ``allowed_factors`` is returned.

    Args:
        factor: The speed-up factor to approximate.
        allowed_factors: Iterable of candidate factors to choose from.

    Returns:
        The candidate from ``allowed_factors`` with the smallest absolute
        distance to ``factor``.

    Raises:
        ValueError: If ``allowed_factors`` is empty.
    """

    def distance(candidate):
        # Absolute gap between this candidate and the requested factor.
        return abs(candidate - factor)

    return min(allowed_factors, key=distance)
|
177 |
|
178 |
def generate_audio_with_pause(srt_file_path):
|
179 |
subtitles = read_srt(srt_file_path)
|
180 |
audio_clips = []
|
181 |
-
allowed_factors = [1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0]
|
182 |
|
183 |
for i, (start_time, end_time, text) in enumerate(subtitles):
|
|
|
|
|
|
|
|
|
184 |
# Generate initial audio
|
185 |
audio_data = model.inference(text=text, speaker_id=speaker_id)
|
186 |
audio_data = audio_data / np.max(np.abs(audio_data))
|
@@ -189,10 +193,14 @@ def generate_audio_with_pause(srt_file_path):
|
|
189 |
desired_duration = time_to_seconds(end_time) - time_to_seconds(start_time)
|
190 |
current_duration = len(audio_data) / 16000
|
191 |
|
|
|
|
|
|
|
192 |
# Adjust audio speed by speedup
|
193 |
if current_duration > desired_duration:
|
194 |
raw_speedup_factor = current_duration / desired_duration
|
195 |
-
speedup_factor = closest_speedup_factor(raw_speedup_factor, allowed_factors)
|
|
|
196 |
audio_data = librosa.effects.time_stretch(
|
197 |
y=audio_data,
|
198 |
rate=speedup_factor,
|
@@ -201,16 +209,23 @@ def generate_audio_with_pause(srt_file_path):
|
|
201 |
)
|
202 |
audio_data = audio_data / np.max(np.abs(audio_data))
|
203 |
audio_data = audio_data * 1.2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
204 |
|
205 |
audio_clips.append(audio_data)
|
206 |
|
207 |
# Add pause
|
208 |
-
if i < len(subtitles) - 1:
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
|
215 |
final_audio = np.concatenate(audio_clips)
|
216 |
|
|
|
173 |
return seconds
|
174 |
|
175 |
def closest_speedup_factor(factor, allowed_factors):
    """Return the allowed factor nearest to ``factor``, increased by 0.1.

    The nearest candidate is the one with the smallest absolute difference
    from ``factor``; on a tie the candidate that appears first in
    ``allowed_factors`` is chosen. A fixed 0.1 is then added, so the result
    is generally not itself a member of ``allowed_factors``.

    Args:
        factor: The speed-up factor to approximate.
        allowed_factors: Iterable of candidate factors to choose from.

    Returns:
        The nearest candidate from ``allowed_factors`` plus 0.1.

    Raises:
        ValueError: If ``allowed_factors`` is empty.
    """

    def distance(candidate):
        # Absolute gap between this candidate and the requested factor.
        return abs(candidate - factor)

    nearest = min(allowed_factors, key=distance)
    # NOTE(review): the reason for the fixed 0.1 offset is not stated in this
    # file section -- confirm the intent with the caller before changing it.
    return nearest + 0.1
|
177 |
|
178 |
def generate_audio_with_pause(srt_file_path):
|
179 |
subtitles = read_srt(srt_file_path)
|
180 |
audio_clips = []
|
181 |
+
# allowed_factors = [1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0]
|
182 |
|
183 |
for i, (start_time, end_time, text) in enumerate(subtitles):
|
184 |
+
# print("=====================================")
|
185 |
+
# print("Text number:", i)
|
186 |
+
# print(f"Start: {start_time}, End: {end_time}, Text: {text}")
|
187 |
+
|
188 |
# Generate initial audio
|
189 |
audio_data = model.inference(text=text, speaker_id=speaker_id)
|
190 |
audio_data = audio_data / np.max(np.abs(audio_data))
|
|
|
193 |
desired_duration = time_to_seconds(end_time) - time_to_seconds(start_time)
|
194 |
current_duration = len(audio_data) / 16000
|
195 |
|
196 |
+
# print(f"Time to seconds: {time_to_seconds(start_time)}, {time_to_seconds(end_time)}")
|
197 |
+
# print(f"Desired duration: {desired_duration}, Current duration: {current_duration}")
|
198 |
+
|
199 |
# Adjust audio speed by speedup
|
200 |
if current_duration > desired_duration:
|
201 |
raw_speedup_factor = current_duration / desired_duration
|
202 |
+
# speedup_factor = closest_speedup_factor(raw_speedup_factor, allowed_factors)
|
203 |
+
speedup_factor = raw_speedup_factor
|
204 |
audio_data = librosa.effects.time_stretch(
|
205 |
y=audio_data,
|
206 |
rate=speedup_factor,
|
|
|
209 |
)
|
210 |
audio_data = audio_data / np.max(np.abs(audio_data))
|
211 |
audio_data = audio_data * 1.2
|
212 |
+
|
213 |
+
if current_duration < desired_duration:
|
214 |
+
padding = int((desired_duration - current_duration) * 16000)
|
215 |
+
audio_data = np.concatenate([np.zeros(padding), audio_data])
|
216 |
+
|
217 |
+
# print(f"Final audio duration: {len(audio_data) / 16000}")
|
218 |
+
# print("=====================================")
|
219 |
|
220 |
audio_clips.append(audio_data)
|
221 |
|
222 |
# Add pause
|
223 |
+
# if i < len(subtitles) - 1:
|
224 |
+
# next_start_time = subtitles[i + 1][0]
|
225 |
+
# pause_duration = time_to_seconds(next_start_time) - time_to_seconds(end_time)
|
226 |
+
# if pause_duration > 0.2:
|
227 |
+
# pause_samples = int(pause_duration * 16000)
|
228 |
+
# audio_clips.append(np.zeros(pause_samples))
|
229 |
|
230 |
final_audio = np.concatenate(audio_clips)
|
231 |
|