Spaces:

Audio-AGI
/

WavJourney

Running on A10G

App Files Community

zzk1st committed on Aug 24, 2023

Commit

4c4a1b3

•

1 Parent(s): 4146809

Another big commit

Browse files

Files changed (10) hide show

README.md +8 -1
data/voice_presets/metadata.json +4 -4
examples/1.mp4 +0 -0
examples/2.mp4 +0 -0
examples/3.mp4 +0 -0
examples/examples.py +76 -13
pipeline.py +8 -0
services.py +2 -1
ui_client.py +26 -22
utils.py +3 -0

README.md CHANGED Viewed

@@ -45,7 +45,14 @@ export WAVJOURNEY_OPENAI_KEY=your_openai_key_here
 6. Set environment variables for using API services
 ```bash
-export WAVJOURNEY_SERVICE_PORT=8021 WAVJOURNEY_SERVICE_URL=127.0.0.1
 ```

 6. Set environment variables for using API services
 ```bash
+# Set the port for the WAVJOURNEY service to 8021
+export WAVJOURNEY_SERVICE_PORT=8021
+# Set the URL for the WAVJOURNEY service to 127.0.0.1
+export WAVJOURNEY_SERVICE_URL=127.0.0.1
+# Limit the maximum script lines for WAVJOURNEY to 999
+export WAVJOURNEY_MAX_SCRIPT_LINES=999
 ```

data/voice_presets/metadata.json CHANGED Viewed

@@ -34,13 +34,13 @@
         "desc": "a female voice of a off-site news reporter, suitable for news scenario",
         "npz_path": "data/voice_presets/npz/news_female_speaker_outside.npz"
     },
-    "child": {
-        "id": "child",
         "desc": "a small young boy voice",
         "npz_path": "data/voice_presets/npz/child_boy.npz"
     },
-    "old_man": {
-        "id": "old_man",
         "desc": "a voice of an old man",
         "npz_path": "data/voice_presets/npz/elder_morgen.npz"
     }

         "desc": "a female voice of a off-site news reporter, suitable for news scenario",
         "npz_path": "data/voice_presets/npz/news_female_speaker_outside.npz"
     },
+    "Child": {
+        "id": "Child",
         "desc": "a small young boy voice",
         "npz_path": "data/voice_presets/npz/child_boy.npz"
     },
+    "Old_man": {
+        "id": "Old_man",
         "desc": "a voice of an old man",
         "npz_path": "data/voice_presets/npz/elder_morgen.npz"
     }

examples/1.mp4 ADDED Viewed

Binary file (365 kB). View file

examples/2.mp4 ADDED Viewed

Binary file (241 kB). View file

examples/3.mp4 ADDED Viewed

Binary file (346 kB). View file

examples/examples.py CHANGED Viewed

@@ -1,24 +1,87 @@
 example1 = {
-    'text': "A hammer is hitting a wooden surface",
-    'table_text': """
-| Audio Type   | Layout     | ID   | Character   | Action   |   Volume | Description   | Length   |
-|--------------|------------|------|-------------|----------|----------|---------------|----------|
-| speech       | foreground | N/A  | Character   | N/A      |      -15 | Hi            | Auto     |
 """,
-    'wav_file': 'examples/example1.wav',
 }
 example2 = {
-    'text': "Two hammer is hitting a wooden surface",
-    'table_text': """
-| Audio Type   | Layout     | ID   | Character   | Action   |   Volume | Description   | Length   |
-|--------------|------------|------|-------------|----------|----------|---------------|----------|
-| speech       | foreground | N/A  | Character   | N/A      |      -15 | Hi            | Auto     |
 """,
-    'wav_file': 'examples/example1.wav',
 }
-examples = [example1,example2]

 example1 = {
+    'text': "An introduction to AI-assisted audio content creation.",
+    'table_script': """
+| Audio Type   | Layout     | ID | Character | Action | Volume | Description                                                      | Length |
+|--------------|------------|----|-----------|--------|--------|------------------------------------------------------------------|--------|
+| music        | background | 1  | N/A       | begin  | -35    | Inspirational technology-themed music                            | Auto   |
+| speech       | foreground | N/A| Narrator  | N/A    | -15    | Welcome to the future of audio content creation.                 | Auto   |
+| sound_effect | foreground | N/A| N/A       | N/A    | -35    | Digital startup sound                                            | 2      |
+| speech       | foreground | N/A| Narrator  | N/A    | -15    | With evolving technology, we are introducing AI-assisted tools for pristine audio production. | Auto |
+| sound_effect | foreground | N/A| N/A       | N/A    | -35    | Keyboard typing noise                                            | 3      |
+| speech       | foreground | N/A| Narrator  | N/A    | -15    | Imagine crafting audio content with the power of AI at your fingertips. | Auto |
+| sound_effect | background | 2  | N/A       | begin  | -35    | Ambiance of a busy control room                                   | Auto   |
+| speech       | foreground | N/A| Narrator  | N/A    | -15    | Enhanced quality, efficient production and limitless creativity, all under one roof. | Auto |
+| sound_effect | background | 2  | N/A       | end    | N/A    | N/A                                                              | Auto   |
+| speech       | foreground | N/A| Narrator  | N/A    | -15    | Unleash your potential with AI-assisted audio content creation.  | Auto   |
+| music        | background | 1  | N/A       | end    | N/A    | N/A                                                              | Auto   |
 """,
+    'table_voice': """
+| Character   | Voice     |
+|-------------|-----------|
+| Narrator    | News_Male |
+""",
+    'wav_file': 'examples/1.mp4',
 }
 example2 = {
+    'text': "A couple dating in a cafe.",
+    'table_script': """
+| Audio Type   | Layout     | ID | Character | Action | Volume | Description                                   | Length |
+|--------------|------------|----|-----------|--------|--------|-----------------------------------------------|--------|
+| sound_effect | background | 1  | N/A       | begin  | -35    | Soft chattering in a cafe                     | Auto   |
+| sound_effect | background | 2  | N/A       | begin  | -38    | Coffee brewing noises                         | Auto   |
+| music        | background | 3  | N/A       | begin  | -35    | Soft jazz playing in the background           | Auto   |
+| speech       | foreground | N/A| Man       | N/A    | -15    | It’s really nice to finally get out and relax a little, isn’t it? | Auto |
+| speech       | foreground | N/A| Woman     | N/A    | -15    | I know, right? We should do this more often.  | Auto   |
+| sound_effect | background | 2  | N/A       | end    | N/A    | N/A                                           | Auto   |
+| speech       | foreground | N/A| Man       | N/A    | -15    | Here’s your coffee, just as you like it.      | Auto   |
+| speech       | foreground | N/A| Woman     | N/A    | -15    | Thank you, it smells wonderful.               | Auto   |
+| music        | background | 3  | N/A       | end    | N/A    | N/A                                           | Auto   |
+| sound_effect | background | 1  | N/A       | end    | N/A    | N/A                                           | Auto   |
+""",
+    'table_voice': """
+| Character   | Voice     |
+|-------------|-----------|
+| Man         | Male1     |
+| Woman       | Female1   |
+""",
+    'wav_file': 'examples/2.mp4',
+}
+example3 = {
+    'text': "A child is participating in a farting contest.",
+    'table_script': """
+| Audio Type   | Layout     | ID | Character | Action | Volume | Description                                          | Length |
+|--------------|------------|----|-----------|--------|--------|------------------------------------------------------|--------|
+| sound_effect | background | 1  | N/A       | begin  | -35    | Outdoor park ambiance, people chattering             | Auto   |
+| music        | background | 2  | N/A       | begin  | -35    | Light comedy theme music, quirky                     | Auto   |
+| speech       | foreground | N/A| Host      | N/A    | -15    | Welcome to the annual Fart Competition.              | Auto   |
+| speech       | foreground | N/A| Host      | N/A    | -15    | Now, let’s welcome our youngest participant.         | Auto   |
+| sound_effect | foreground | N/A| N/A       | N/A    | -35    | Clapping sound                                       | 2      |
+| speech       | foreground | N/A| Child     | N/A    | -15    | Hi, I’m excited to be here.                          | Auto   |
+| sound_effect | foreground | N/A| N/A       | N/A    | -35    | Short, cartoonish duration of a fart sound           | 4      |
+| sound_effect | foreground | N/A| N/A       | N/A    | -35    | Audience laughing and applauding                     | 2      |
+| speech       | foreground | N/A| Host      | N/A    | -15    | Wow, that was impressive! Let’s give another round of applause! | Auto |
+| sound_effect | foreground | N/A| N/A       | N/A    | -35    | Audience clapping and cheering                       | 3      |
+| music        | background | 2  | N/A       | end    | N/A    | N/A                                                  | Auto   |
+| sound_effect | background | 1  | N/A       | end    | N/A    | N/A                                                  | Auto   |
+""",
+    'table_voice': """
+| Character   | Voice     |
+|-------------|-----------|
+| Host        | Male1     |
+| Child       | Child     |
 """,
+    'wav_file': 'examples/3.mp4',
 }
+examples = [example1, example2, example3]

pipeline.py CHANGED Viewed

@@ -194,6 +194,14 @@ def generate_json_file(session_id, input_text, api_key):
 # Function call used by Gradio: json to result wav
 def generate_audio(session_id, json_script, api_key):
     output_path = utils.get_session_path(session_id)
     output_audio_path = utils.get_session_audio_path(session_id)
     voices = voice_presets.get_merged_voice_presets(session_id)

 # Function call used by Gradio: json to result wav
 def generate_audio(session_id, json_script, api_key):
+    def count_lines(content):
+        # Split the string using the newline character and count the non-empty lines
+        return sum(1 for line in content.split('\n') if line.strip())
+    max_lines = utils.get_max_script_lines()
+    if count_lines(json_script) > max_lines:
+        raise ValueError(f'The number of lines of the JSON script has exceeded {max_lines}!')
     output_path = utils.get_session_path(session_id)
     output_audio_path = utils.get_session_audio_path(session_id)
     voices = voice_presets.get_merged_voice_presets(session_id)

services.py CHANGED Viewed

@@ -227,4 +227,5 @@ def parse_voice():
 if __name__ == '__main__':
     service_port = get_service_port()
-    app.run(debug=False, port=service_port)

 if __name__ == '__main__':
     service_port = get_service_port()
+    # We disable multithreading to force services to process one request at a time and avoid CUDA OOM
+    app.run(debug=False, threaded=False, port=service_port)

ui_client.py CHANGED Viewed

@@ -54,7 +54,7 @@ def generate_script_fn(instruction, _state: gr.State):
         json_script = generate_json_file(session_id, instruction, api_key)
         table_text = convert_json_to_md(json_script)
     except Exception as e:
-        gr.Warning(str(e) + traceback.format_exc())
         print(f"Generating script error: {str(e)}")
         traceback.print_exc()
         return [
@@ -99,7 +99,7 @@ def generate_audio_fn(state):
     except Exception as e:
         print(f"Generation audio error: {str(e)}")
         traceback.print_exc()
-        gr.Warning(str(e) + traceback.format_exc())
     return [
         None,
@@ -210,7 +210,7 @@ def add_voice_preset(vp_id, vp_desc, file, ui_state, added_voice_preset):
         except Exception as exception:
             print(exception)
             traceback.print_exc()
-            gr.Warning(str(exception) + traceback.format_exc())
     # After added
     dataframe = get_voice_preset_to_list(ui_state)
@@ -451,10 +451,29 @@ with gr.Blocks(css=css) as interface:
         loading_icon = gr.HTML(loading_icon_html)
         share_button = gr.Button(value="Share to community", elem_id="share-btn")
     # System Voice Presets
     gr.Markdown(label='System Voice Presets', value='# System Voice Presets')
-    system_markdown_voice_presets = gr.Dataframe(label='System Voice Presets', headers=VOICE_PRESETS_HEADERS,
-                                                 value=system_voice_presets)
     # User Voice Preset Related
     gr.Markdown('# (Optional) Speaker Customization ')
     with gr.Accordion("Click to add speakers", open=False):
@@ -476,22 +495,7 @@ with gr.Blocks(css=css) as interface:
         vp_file = gr.File(label='Wav File', type='file', file_types=['.wav'],
                         interactive=True)
         vp_submit = gr.Button(label='Upload Voice Preset', value="Upload Voice Preset")
-    # add examples
-    from examples.examples import examples as WJExamples
-    def example_fn(idx, _text_input):
-        print('from example', idx, _text_input)
-        example = WJExamples[int(idx)-1]
-        return example['table_text'], gr.make_waveform(example['wav_file'])
-    _idx_input = gr.Textbox(label='Example No')
-    _idx_input.visible=False
-    gr.Examples(
-            [[idx+1, x['text']] for idx, x in enumerate(WJExamples)],
-            fn=example_fn,
-            inputs=[_idx_input, text_input],
-            outputs=[char_voice_map_markdown, audio_output],
-            cache_examples=True,
-        )
     # clear btn, will re-new a session
     clear_btn = gr.ClearButton(value='Clear All')
@@ -579,5 +583,5 @@ with gr.Blocks(css=css) as interface:
     # debug only
     # print_state_btn = gr.Button(value='Print State')
     # print_state_btn.click(fn=lambda state, state2: print(state, state2), inputs=[ui_state, selected_voice_presets])
-interface.queue(concurrency_count=5, max_size=20)
 interface.launch()

         json_script = generate_json_file(session_id, instruction, api_key)
         table_text = convert_json_to_md(json_script)
     except Exception as e:
+        gr.Warning(str(e))
         print(f"Generating script error: {str(e)}")
         traceback.print_exc()
         return [
     except Exception as e:
         print(f"Generation audio error: {str(e)}")
         traceback.print_exc()
+        gr.Warning(str(e))
     return [
         None,
         except Exception as exception:
             print(exception)
             traceback.print_exc()
+            gr.Warning(str(exception))
     # After added
     dataframe = get_voice_preset_to_list(ui_state)
         loading_icon = gr.HTML(loading_icon_html)
         share_button = gr.Button(value="Share to community", elem_id="share-btn")
+    # add examples
+    from examples.examples import examples as WJExamples
+    def example_fn(idx, _text_input):
+        print('from example', idx, _text_input)
+        example = WJExamples[int(idx)-1]
+        print(example['table_script'], example['table_voice'], gr.make_waveform(example['wav_file']))
+        return example['table_script'], example['table_voice'], gr.make_waveform(example['wav_file'])
+    _idx_input = gr.Textbox(label='Example No.')
+    _idx_input.visible=False
+    gr.Examples(
+            [[idx+1, x['text']] for idx, x in enumerate(WJExamples)],
+            fn=example_fn,
+            inputs=[_idx_input, text_input],
+            outputs=[audio_script_markdown, char_voice_map_markdown, audio_output],
+            cache_examples=True,
+        )
     # System Voice Presets
     gr.Markdown(label='System Voice Presets', value='# System Voice Presets')
+    with gr.Accordion("Click to see system speakers", open=False):
+        system_markdown_voice_presets = gr.Dataframe(label='System Voice Presets', headers=VOICE_PRESETS_HEADERS,
+                                                    value=system_voice_presets)
     # User Voice Preset Related
     gr.Markdown('# (Optional) Speaker Customization ')
     with gr.Accordion("Click to add speakers", open=False):
         vp_file = gr.File(label='Wav File', type='file', file_types=['.wav'],
                         interactive=True)
         vp_submit = gr.Button(label='Upload Voice Preset', value="Upload Voice Preset")
     # clear btn, will re-new a session
     clear_btn = gr.ClearButton(value='Clear All')
     # debug only
     # print_state_btn = gr.Button(value='Print State')
     # print_state_btn.click(fn=lambda state, state2: print(state, state2), inputs=[ui_state, selected_voice_presets])
+interface.queue(concurrency_count=1, max_size=20)
 interface.launch()

utils.py CHANGED Viewed

@@ -77,3 +77,6 @@ def get_api_key():
     api_key = os.environ.get('WAVJOURNEY_OPENAI_KEY')
     return api_key

     api_key = os.environ.get('WAVJOURNEY_OPENAI_KEY')
     return api_key
def get_max_script_lines():
    """Return the maximum number of script lines allowed.

    The limit is read from the ``WAVJOURNEY_MAX_SCRIPT_LINES`` environment
    variable. When the variable is unset or blank, the default of 999 is used.

    Returns:
        int: The configured line limit.

    Raises:
        ValueError: If the variable is set to a value that is not an integer.
    """
    default_max_lines = 999
    raw_value = os.environ.get('WAVJOURNEY_MAX_SCRIPT_LINES')
    # An exported-but-empty variable should behave like an unset one instead
    # of crashing with an opaque "invalid literal for int()" error.
    if raw_value is None or not raw_value.strip():
        return default_max_lines
    try:
        return int(raw_value)
    except ValueError as err:
        # Same exception type as before, but the message names the variable.
        raise ValueError(
            f'WAVJOURNEY_MAX_SCRIPT_LINES must be an integer, got {raw_value!r}'
        ) from err