Spaces:

herimor
/

voxtream

Running on Zero

herimor committed on Sep 27

Commit

0d0d952

1 Parent(s): ae1f9c4

Add max audio length handling

Files changed (3) hide show

app.py CHANGED Viewed

@@ -91,18 +91,20 @@ def main():
                 prompt_audio = gr.Audio(
                     sources=["microphone", "upload"],
                     type="filepath",
-                    label="Prompt audio (3-5 sec of target voice)",
                 )
                 prompt_text = gr.Textbox(
                     lines=3,
-                    label="Prompt transcript",
-                    placeholder="Text that matches the prompt audio (Required)",
                 )
             with gr.Column(scale=1, elem_id="right-col"):
                 target_text = gr.Textbox(
                     lines=3,
-                    label="Target text",
                     placeholder="What you want the model to say",
                 )
                 output_audio = gr.Audio(

                 prompt_audio = gr.Audio(
                     sources=["microphone", "upload"],
                     type="filepath",
+                    label="Prompt audio (3-5 sec of target voice. Max 10 sec)",
                 )
                 prompt_text = gr.Textbox(
                     lines=3,
+                    max_length=config.max_prompt_chars,
+                    label=f"Prompt transcript. Max characters: {config.max_prompt_chars} (Required)",
+                    placeholder="Text that matches the prompt audio",
                 )
             with gr.Column(scale=1, elem_id="right-col"):
                 target_text = gr.Textbox(
                     lines=3,
+                    max_length=config.max_phone_tokens,
+                    label=f"Target text. Max characters: {config.max_phone_tokens}",
                     placeholder="What you want the model to say",
                 )
                 output_audio = gr.Audio(

configs/generator.json CHANGED Viewed

@@ -26,6 +26,9 @@
     "phoneme_dict_name": "phoneme_to_token.json",
     "nltk_resource": "taggers/averaged_perceptron_tagger_eng",
     "aligner": "charsiu/en_w2v2_fc_10ms",
     "cache_prompt": false,
     "phoneme_index_map": {
         "0": [

     "phoneme_dict_name": "phoneme_to_token.json",
     "nltk_resource": "taggers/averaged_perceptron_tagger_eng",
     "aligner": "charsiu/en_w2v2_fc_10ms",
+    "max_prompt_sec": 10,
+    "max_prompt_chars": 250,
+    "max_phone_tokens": 1000,
     "cache_prompt": false,
     "phoneme_index_map": {
         "0": [

requirements.txt CHANGED Viewed

@@ -1,3 +1,3 @@
-voxtream==0.1.3
 gradio_client==1.3.0
 pydantic==2.10.6

+voxtream==0.1.4
 gradio_client==1.3.0
 pydantic==2.10.6