import asyncio
import os
from pathlib import Path
from uuid import uuid4

from langchain_community.callbacks import get_openai_callback
from pydantic import BaseModel
from pydub import AudioSegment

from src import tts, utils
from src.config import (
    CONTEXT_CHAR_LEN_FOR_TTS,
    ELEVENLABS_MAX_PARALLEL,
    OPENAI_MAX_PARALLEL,
    logger,
)
from src.lc_callbacks import LCMessageLoggerAsync
from src.preprocess_tts_emotions_chain import TTSParamProcessor
from src.schemas import SoundEffectsParams, TTSParams, TTSTimestampsAlignment, TTSTimestampsResponse
from src.select_voice_chain import (
    CharacterPropertiesNullable,
    SelectVoiceChainOutput,
    VoiceSelector,
)
from src.sound_effects_design import (
    SoundEffectDescription,
    SoundEffectsDesignOutput,
    create_sound_effects_design_chain,
)
from src.text_modification_chain import modify_text_chain
from src.text_split_chain import SplitTextOutput, create_split_text_chain
from src.utils import GPTModels, prettify_unknown_character_label
from src.web.constructor import HTMLGenerator
from src.web.utils import (
    create_status_html,
    generate_text_split_inner_html_no_effect,
    generate_text_split_inner_html_with_effects,
    generate_voice_mapping_inner_html,
)


class TTSPhrasesGenerationOutput(BaseModel):
    audio_fps: list[str]
    char2time: TTSTimestampsAlignment


class AudiobookBuilder:
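    """
    Orchestrates the audiobook generation pipeline: text normalization and splitting,
    optional sound effects design, character-to-voice mapping, TTS generation with
    timestamps, audio post-processing, and final concatenation / effects overlay.
    """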
    def __init__(self, rm_artifacts: bool = False):
        self.voice_selector = VoiceSelector()
        self.params_tts_processor = TTSParamProcessor()
        self.rm_artifacts = rm_artifacts
        self.min_sound_effect_duration_sec = 1
        self.sound_effects_prompt_influence = 0.75  # seems to work nicely
        self.html_generator = HTMLGenerator()
        self.name = type(self).__name__

    @staticmethod
    async def _prepare_text_for_tts(text: str) -> str:
        chain = modify_text_chain(llm_model=GPTModels.GPT_4o)
        with get_openai_callback() as cb:
            result = await chain.ainvoke(
                {"text": text}, config={"callbacks": [LCMessageLoggerAsync()]}
            )
        logger.info(
            f'Finished modifying text (caps and symbols: "?", "!", "..."). OpenAI callback stats: {cb}'
        )
        return result.text_modified

    @staticmethod
    async def _split_text(text: str) -> SplitTextOutput:
        chain = create_split_text_chain(llm_model=GPTModels.GPT_4o)
        with get_openai_callback() as cb:
            chain_out = await chain.ainvoke(
                {"text": text}, config={"callbacks": [LCMessageLoggerAsync()]}
            )
        logger.info(f'end of splitting text into characters. openai callback stats: {cb}')
        return chain_out

    @staticmethod
    async def _design_sound_effects(text: str) -> SoundEffectsDesignOutput:
        chain = create_sound_effects_design_chain(llm_model=GPTModels.GPT_4o)
        with get_openai_callback() as cb:
            res = await chain.ainvoke(
                {"text": text}, config={"callbacks": [LCMessageLoggerAsync()]}
            )
        logger.info(
            f'designed {len(res.sound_effects_descriptions)} sound effects. '
            f'openai callback stats: {cb}'
        )
        return res

    async def _map_characters_to_voices(
        self, text_split: SplitTextOutput
    ) -> SelectVoiceChainOutput:
        chain = self.voice_selector.create_voice_mapping_chain(llm_model=GPTModels.GPT_4o)
        with get_openai_callback() as cb:
            chain_out = await chain.ainvoke(
                {
                    "text": text_split.text_annotated,
                    "characters": text_split.characters,
                },
                config={"callbacks": [LCMessageLoggerAsync()]},
            )
        logger.info(f'end of mapping characters to voices. openai callback stats: {cb}')
        return chain_out

    async def _prepare_params_for_tts(self, text_split: SplitTextOutput) -> list[TTSParams]:
        semaphore = asyncio.Semaphore(OPENAI_MAX_PARALLEL)

        async def run_task_with_semaphore(func, **params):
            async with semaphore:
                outputs = await func(**params)
                return outputs

        tasks = []

        for character_phrase in text_split.phrases:
            tasks.append(
                run_task_with_semaphore(
                    func=self.params_tts_processor.run,
                    text=character_phrase.text,
                )
            )

        tts_tasks_results = await asyncio.gather(*tasks)

        return tts_tasks_results

    @staticmethod
    def _add_voice_ids_to_tts_params(
        text_split: SplitTextOutput,
        tts_params_list: list[TTSParams],
        character2voice: dict[str, str],
    ) -> list[TTSParams]:
        for character_phrase, params in zip(text_split.phrases, tts_params_list):
            params.voice_id = character2voice[character_phrase.character]
        return tts_params_list

    @staticmethod
    def _get_left_and_right_contexts_for_each_phrase(
        phrases, context_length=CONTEXT_CHAR_LEN_FOR_TTS
    ):
        """
        Return phrases from left and right sides which don't exceed `context_length`.
        Approx. number of words/tokens based on `context_length` can be calculated by dividing it by 5.
        """
        # TODO: split first context phrase if it exceeds `context_length`, currently it's not added.
        # TODO: optimize algorithm to linear time using sliding window on top of cumulative length sums.
        left_right_contexts = []
        for i in range(len(phrases)):
            left_text, right_text = '', ''
            for j in range(i - 1, -1, -1):
                if len(left_text) + len(phrases[j].text) < context_length:
                    left_text = phrases[j].text + left_text
                else:
                    break
            for phrase in phrases[i + 1 :]:
                if len(right_text) + len(phrase.text) < context_length:
                    right_text += phrase.text
                else:
                    break
            left_right_contexts.append((left_text, right_text))
        return left_right_contexts

    def _add_previous_and_next_context_to_tts_params(
        self,
        text_split: SplitTextOutput,
        tts_params_list: list[TTSParams],
    ) -> list[TTSParams]:
        left_right_contexts = self._get_left_and_right_contexts_for_each_phrase(text_split.phrases)
        for cur_contexts, params in zip(left_right_contexts, tts_params_list):
            left_context, right_context = cur_contexts
            params.previous_text = left_context
            params.next_text = right_context
        return tts_params_list

    @staticmethod
    async def _generate_tts_audio(
        tts_params_list: list[TTSParams],
        out_dp: str,
    ) -> TTSPhrasesGenerationOutput:
        semaphore = asyncio.Semaphore(ELEVENLABS_MAX_PARALLEL)

        async def _tts_with_semaphore(params: TTSParams) -> TTSTimestampsResponse:
            async with semaphore:
                return await tts.tts_w_timestamps(params=params)

        tasks = [_tts_with_semaphore(params=params) for params in tts_params_list]
        tts_responses: list[TTSTimestampsResponse] = await asyncio.gather(*tasks)

        tts_audio_fps = []
        for ix, (params, res) in enumerate(zip(tts_params_list, tts_responses), start=1):
            out_fp_no_ext = os.path.join(out_dp, f'tts_output_{ix}')
            out_fp = res.write_audio_to_file(
                filepath_no_ext=out_fp_no_ext, audio_format=params.output_format
            )
            tts_audio_fps.append(out_fp)

        # combine alignments
        alignments = [response.alignment for response in tts_responses]
        char2time = TTSTimestampsAlignment.combine_alignments(alignments=alignments)
        # filter alignments
        char2time = char2time.filter_chars_without_duration()

        return TTSPhrasesGenerationOutput(audio_fps=tts_audio_fps, char2time=char2time)

    def _update_sound_effects_descriptions_with_durations(
        self,
        sound_effects_descriptions: list[SoundEffectDescription],
        char2time: TTSTimestampsAlignment,
    ) -> list[SoundEffectDescription]:
        for sed in sound_effects_descriptions:
            ix_start, ix_end = sed.ix_start_orig_text, sed.ix_end_orig_text
            time_start = char2time.get_start_time_by_char_ix(ix_start, safe=True)
            time_end = char2time.get_end_time_by_char_ix(ix_end, safe=True)
            duration = time_end - time_start
            # apply min effect duration
            duration = max(self.min_sound_effect_duration_sec, duration)
            # update inplace
            sed.start_sec = time_start
            sed.duration_sec = duration
        return sound_effects_descriptions

    # def _filter_short_sound_effects(
    #     self,
    #     sound_effects_descriptions: list[SoundEffectDescription],
    # ) -> list[SoundEffectDescription]:
    #     filtered = [
    #         sed
    #         for sed in sound_effects_descriptions
    #         if sed.duration_sec > self.min_sound_effect_duration_sec
    #     ]

    #     len_orig = len(sound_effects_descriptions)
    #     len_new = len(filtered)
    #     logger.info(
    #         f'{len_new} out of {len_orig} original sound effects are kept '
    #         f'after filtering by min duration: {self.min_sound_effect_duration_sec}'
    #     )

    #     return filtered

    def _sound_effects_description_2_generation_params(
        self,
        sound_effects_descriptions: list[SoundEffectDescription],
    ) -> list[SoundEffectsParams]:
        params = [
            SoundEffectsParams(
                text=sed.prompt,
                duration_seconds=sed.duration_sec,
                prompt_influence=self.sound_effects_prompt_influence,
            )
            for sed in sound_effects_descriptions
        ]
        return params

    @staticmethod
    async def _generate_sound_effects(
        sound_effects_params: list[SoundEffectsParams],
        out_dp: str,
    ) -> list[str]:
        semaphore = asyncio.Semaphore(ELEVENLABS_MAX_PARALLEL)

        async def _se_gen_with_semaphore(params: SoundEffectsParams) -> list[bytes]:
            async with semaphore:
                return await tts.sound_generation_consumed(params=params)

        tasks = [_se_gen_with_semaphore(params=params) for params in sound_effects_params]
        results = await asyncio.gather(*tasks)

        se_fps = []
        for ix, task_res in enumerate(results, start=1):
            out_fp = os.path.join(out_dp, f'sound_effect_{ix}.wav')
            utils.write_chunked_bytes(data=task_res, fp=out_fp)
            se_fps.append(out_fp)

        return se_fps

    @staticmethod
    def _save_text_split_debug_data(
        text_split: SplitTextOutput,
        out_dp: str,
    ):
        out_fp = os.path.join(out_dp, 'text_split.json')
        # NOTE: serialize via pydantic's `model_dump()` for correct conversion
        data = text_split.model_dump()
        utils.write_json(data, fp=out_fp)

    @staticmethod
    def _save_tts_debug_data(
        tts_params_list: list[TTSParams],
        tts_out: TTSPhrasesGenerationOutput,
        out_dp: str,
    ):
        out_fp = os.path.join(out_dp, 'tts.json')
        # NOTE: use `to_dict()` for correct conversion
        data = [param.to_dict() for param in tts_params_list]
        utils.write_json(data, fp=out_fp)

        out_fp = os.path.join(out_dp, 'tts_char2time.csv')
        df_char2time = tts_out.char2time.to_dataframe()
        df_char2time.to_csv(out_fp, index=True)

    @staticmethod
    def _save_sound_effects_debug_data(
        sound_effect_design_output: SoundEffectsDesignOutput,
        sound_effect_descriptions: list[SoundEffectDescription],
        out_dp: str,
    ):
        out_fp = os.path.join(out_dp, 'sound_effects_raw_llm_output.txt')
        utils.write_txt(sound_effect_design_output.text_annotated, fp=out_fp)

        out_fp = os.path.join(out_dp, 'sound_effects_descriptions.json')
        data = [sed.model_dump() for sed in sound_effect_descriptions]
        utils.write_json(data, fp=out_fp)

    @staticmethod
    def _postprocess_tts_audio(audio_fps: list[str], out_dp: str, target_dBFS: float) -> list[str]:
        fps = []
        for in_fp in audio_fps:
            audio_segment = AudioSegment.from_file(in_fp)
            normalized_audio = utils.normalize_audio(audio_segment, target_dBFS)

            out_fp = os.path.join(out_dp, f"{Path(in_fp).stem}.normalized.wav")
            normalized_audio.export(out_fp, format="wav")
            fps.append(out_fp)

        return fps

    @staticmethod
    def _postprocess_sound_effects(
        audio_fps: list[str], out_dp: str, target_dBFS: float, fade_ms: int
    ) -> list[str]:
        fps = []
        for in_fp in audio_fps:
            audio_segment = AudioSegment.from_file(in_fp)

            processed = utils.normalize_audio(audio_segment, target_dBFS)

            processed = processed.fade_in(duration=fade_ms)
            processed = processed.fade_out(duration=fade_ms)

            out_fp = os.path.join(out_dp, f"{Path(in_fp).stem}.postprocessed.wav")
            processed.export(out_fp, format="wav")
            fps.append(out_fp)

        return fps

    @staticmethod
    def _concatenate_audiofiles(audio_fps: list[str], out_wav_fp: str):
        concat = AudioSegment.from_file(audio_fps[0])
        for filename in audio_fps[1:]:
            next_audio = AudioSegment.from_file(filename)
            concat += next_audio
        logger.info(f'saving concatenated audiobook to: "{out_wav_fp}"')
        concat.export(out_wav_fp, format="wav")

    def _get_text_split_html(
        self,
        text_split: SplitTextOutput,
        sound_effects_descriptions: list[SoundEffectDescription] | None,
    ):
        # modify copies of original phrases, keep original intact
        character_phrases = [p.model_copy(deep=True) for p in text_split.phrases]
        for phrase in character_phrases:
            phrase.character = prettify_unknown_character_label(phrase.character)

        if not sound_effects_descriptions:
            inner = generate_text_split_inner_html_no_effect(character_phrases=character_phrases)
        else:
            inner = generate_text_split_inner_html_with_effects(
                character_phrases=character_phrases,
                sound_effects_descriptions=sound_effects_descriptions,
            )

        final = self.html_generator.generate_text_split(inner)
        return final

    def _get_voice_mapping_html(
        self, use_user_voice: bool, select_voice_chain_out: SelectVoiceChainOutput
    ):
        if use_user_voice:
            return ''
        inner = generate_voice_mapping_inner_html(select_voice_chain_out)
        final = self.html_generator.generate_voice_assignments(inner)
        return final

    STAGE_1 = 'Text Analysis'
    STAGE_2 = 'Voices Selection'
    STAGE_3 = 'Audio Generation'

    def _get_yield_data_stage_0(self):
        status = self.html_generator.generate_status("Starting", [("Analyzing Text...", False)])
        return None, "", status

    def _get_yield_data_stage_1(self, text_split_html: str):
        status_html = create_status_html(
            "Text Analysis Complete",
            [(self.STAGE_1, True), ("Selecting Voices...", False)],
        )
        html = status_html + text_split_html
        return None, "", html

    def _get_yield_data_stage_2(self, text_split_html: str, voice_mapping_html: str):
        status_html = create_status_html(
            "Voice Selection Complete",
            [(self.STAGE_1, True), (self.STAGE_2, True), ("Generating Audio...", False)],
        )
        html = status_html + text_split_html + voice_mapping_html + '</div>'
        return None, "", html

    def _get_yield_data_stage_3(
        self, final_audio_fp: str, text_split_html: str, voice_mapping_html: str
    ):
        status_html = create_status_html(
            "Audiobook is ready ✨",
            [(self.STAGE_1, True), (self.STAGE_2, True), (self.STAGE_3, True)],
        )
        third_stage_result_html = (
            status_html
            + text_split_html
            + voice_mapping_html
            + self.html_generator.generate_final_message()
            + '</div>'
        )
        return final_audio_fp, "", third_stage_result_html

    async def run(
        self,
        text: str,
        generate_effects: bool,
        use_user_voice: bool = False,
        voice_id: str | None = None,
    ):
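        """
        Run the full pipeline and stream progress to the web UI.

        Async generator: after each stage (text analysis, voice selection, audio
        generation) it yields a 3-tuple of (final audio filepath or None, an empty
        string placeholder, status/result HTML).
        """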
        now_str = utils.get_utc_now_str()
        uuid_trimmed = str(uuid4()).split('-')[0]
        dir_name = f'{now_str}-{uuid_trimmed}'
        out_dp_root = os.path.join('data', 'audiobooks', dir_name)
        os.makedirs(out_dp_root, exist_ok=False)

        debug_dp = os.path.join(out_dp_root, 'debug')
        os.makedirs(debug_dp)

        # TODO: currently, we are constantly writing and reading audio segments from files.
        # I think it will be more efficient to keep all audio in memory.

        # zero stage
        if use_user_voice and not voice_id:
            yield None, "", self.html_generator.generate_message_without_voice_id()

        else:
            yield self._get_yield_data_stage_0()

            text_for_tts = await self._prepare_text_for_tts(text=text)

            # TODO: call sound effects chain in parallel with text split chain
            text_split = await self._split_text(text=text_for_tts)
            self._save_text_split_debug_data(text_split=text_split, out_dp=debug_dp)
            # yield stage 1
            text_split_html = self._get_text_split_html(
                text_split=text_split, sound_effects_descriptions=None
            )
            yield self._get_yield_data_stage_1(text_split_html=text_split_html)

            if generate_effects:
                se_design_output = await self._design_sound_effects(text=text_for_tts)
                se_descriptions = se_design_output.sound_effects_descriptions
                text_split_html = self._get_text_split_html(
                    text_split=text_split, sound_effects_descriptions=se_descriptions
                )

            # TODO: run voice mapping and tts params selection in parallel
            if not use_user_voice:
                select_voice_chain_out = await self._map_characters_to_voices(text_split=text_split)
            else:
                if voice_id is None:
                    raise ValueError('voice_id must be provided when use_user_voice is True')
                select_voice_chain_out = SelectVoiceChainOutput(
                    character2props={
                        char: CharacterPropertiesNullable(gender=None, age_group=None)
                        for char in text_split.characters
                    },
                    character2voice={char: voice_id for char in text_split.characters},
                )
            tts_params_list = await self._prepare_params_for_tts(text_split=text_split)

            # yield stage 2
            voice_mapping_html = self._get_voice_mapping_html(
                use_user_voice=use_user_voice, select_voice_chain_out=select_voice_chain_out
            )
            yield self._get_yield_data_stage_2(
                text_split_html=text_split_html, voice_mapping_html=voice_mapping_html
            )

            tts_params_list = self._add_voice_ids_to_tts_params(
                text_split=text_split,
                tts_params_list=tts_params_list,
                character2voice=select_voice_chain_out.character2voice,
            )

            tts_params_list = self._add_previous_and_next_context_to_tts_params(
                text_split=text_split,
                tts_params_list=tts_params_list,
            )

            tts_dp = os.path.join(out_dp_root, 'tts')
            os.makedirs(tts_dp)
            tts_out = await self._generate_tts_audio(tts_params_list=tts_params_list, out_dp=tts_dp)

            self._save_tts_debug_data(
                tts_params_list=tts_params_list, tts_out=tts_out, out_dp=debug_dp
            )

            if generate_effects:
                se_descriptions = self._update_sound_effects_descriptions_with_durations(
                    sound_effects_descriptions=se_descriptions, char2time=tts_out.char2time
                )

                # no need for filtering, since the min duration is enforced above
                # se_descriptions = self._filter_short_sound_effects(
                #     sound_effects_descriptions=se_descriptions
                # )

                se_params = self._sound_effects_description_2_generation_params(
                    sound_effects_descriptions=se_descriptions
                )

                if len(se_descriptions) != len(se_params):
                    raise ValueError(
                        f'expected {len(se_descriptions)} sound effects params, got: {len(se_params)}'
                    )

                effects_dp = os.path.join(out_dp_root, 'sound_effects')
                os.makedirs(effects_dp)
                se_fps = await self._generate_sound_effects(
                    sound_effects_params=se_params, out_dp=effects_dp
                )

                if len(se_descriptions) != len(se_fps):
                    raise ValueError(
                        f'expected {len(se_descriptions)} generated sound effects, got: {len(se_fps)}'
                    )

                self._save_sound_effects_debug_data(
                    sound_effect_design_output=se_design_output,
                    sound_effect_descriptions=se_descriptions,
                    out_dp=debug_dp,
                )

            tts_normalized_dp = os.path.join(out_dp_root, 'tts_normalized')
            os.makedirs(tts_normalized_dp)
            tts_norm_fps = self._postprocess_tts_audio(
                audio_fps=tts_out.audio_fps,
                out_dp=tts_normalized_dp,
                target_dBFS=-20,
            )

            if generate_effects:
                se_normalized_dp = os.path.join(out_dp_root, 'sound_effects_postprocessed')
                os.makedirs(se_normalized_dp)
                se_norm_fps = self._postprocess_sound_effects(
                    audio_fps=se_fps,
                    out_dp=se_normalized_dp,
                    target_dBFS=-27,
                    fade_ms=500,
                )

            tts_concat_fp = os.path.join(out_dp_root, f'audiobook_{now_str}.wav')
            self._concatenate_audiofiles(audio_fps=tts_norm_fps, out_wav_fp=tts_concat_fp)

            if not generate_effects:
                final_audio_fp = tts_concat_fp
            else:
                tts_concat_with_effects_fp = os.path.join(
                    out_dp_root, f'audiobook_with_effects_{now_str}.wav'
                )
                se_starts_sec = [sed.start_sec for sed in se_descriptions]
                utils.overlay_multiple_audio(
                    main_audio_fp=tts_concat_fp,
                    audios_to_overlay_fps=se_norm_fps,
                    starts_sec=se_starts_sec,
                    out_fp=tts_concat_with_effects_fp,
                )
                final_audio_fp = tts_concat_with_effects_fp

            utils.rm_dir_conditional(dp=out_dp_root, to_remove=self.rm_artifacts)

            # yield stage 3
            yield self._get_yield_data_stage_3(
                final_audio_fp=final_audio_fp,
                text_split_html=text_split_html,
                voice_mapping_html=voice_mapping_html,
            )

        logger.info(f'end of {self.name}.run()')
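

# --- Illustrative usage sketch (assumption, not part of the original pipeline) ---
# A minimal way to drive the builder's async generator from a plain script, assuming
# the required OpenAI / ElevenLabs credentials are configured in the environment.
# The sample text and flag values below are hypothetical.
if __name__ == '__main__':

    async def _demo():
        builder = AudiobookBuilder(rm_artifacts=False)
        async for audio_fp, _, status_html in builder.run(
            text='"Hello there," said the stranger.',
            generate_effects=False,
        ):
            logger.info(f'stage update: audio_fp={audio_fp}, html chars={len(status_html)}')

    asyncio.run(_demo())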