PicoAudio2

Running on Zero

App Files Files Community

rookie9 committed on Oct 9

Commit

8ae83f3

verified ·

1 Parent(s): bd9bdce

Update app.py

Browse files

Files changed (1) hide show

app.py +50 -5

app.py CHANGED Viewed

@@ -21,10 +21,54 @@ def is_tdc_format_valid(tdc_str):
     except Exception:
         return False
 def infer(input_text, input_onset, input_length, time_control):
     # para
     if input_onset and not is_tdc_format_valid(input_onset):
         input_onset = "random"
     if time_control:
         if not input_onset or not input_length:
             input_json = json.loads(get_time_info(input_text))
@@ -48,12 +92,13 @@ def infer(input_text, input_onset, input_length, time_control):
             waveform[0, 0].cpu().numpy(),
             samplerate=24000,
         )
     return output_wav, str(input_onset)
 demo = gr.Interface(
     fn=infer,
     inputs=[
-        gr.Textbox(label="TCC (caption, required)", value="a dog barks"),
         gr.Textbox(label="TDC (optional, see format)", value="random"),
         gr.Textbox(label="Length (seconds, optional)", value="10.0"),
         gr.Checkbox(label="Enable Time Control", value=False),
@@ -64,11 +109,11 @@ demo = gr.Interface(
     ],
     title="PicoAudio2 Online Inference",
     description=(
-        "TCC (caption) is neto generate audio. "
-        "If you need time control, please enter TDC and length (in seconds). "
         "Alternatively, you can let the LLM generate TDC, but API quota limits may affect availability. "
-        "TDC format: \"event1__start1-end1_start2-end2--event2__start1-end1\", for example: "
-        "\"a_dog_barks__1.0-2.0_3.0-4.0--a_man_speaks__5.0-6.0\"."
         "If the format of TDC is wrong or no input length, the model will generate audio without temporal control. Sorry!"
     )
 )

     except Exception:
         return False
def a_to_b(a_str):
    """Convert the human-readable event format into the underscore format.

    Input looks like ``"a dog barks(1.0-2.0, 3.0-4.0); a man speaks(5.0-6.0)"``
    and the result looks like
    ``"a_dog_barks__1.0-2.0_3.0-4.0--a_man_speaks__5.0-6.0"``.

    Args:
        a_str: Events separated by ``;``, each written as
            ``name(span, span, ...)``. ``None`` and ``""`` are accepted.

    Returns:
        The events joined by ``--``, each as ``name__span_span``. Items
        that do not contain both ``(`` and ``)`` are dropped, so a string
        with no parenthesised item yields ``""``.
    """
    if not a_str:
        return ''
    converted = []
    for item in a_str.split(';'):
        item = item.strip()
        if not item or '(' not in item or ')' not in item:
            continue
        name, times = item.split('(', 1)
        # Collapse whitespace runs to one underscore: a doubled space must
        # not become "__", which is the name/time separator of the output.
        name = '_'.join(name.split())
        # Split on commas and trim each span so that spacing around the
        # parentheses or commas, and empty spans, never reach the output.
        spans = [span.strip() for span in times.strip().strip(')').split(',')]
        times = '_'.join(span for span in spans if span)
        converted.append(f"{name}__{times}")
    return '--'.join(converted)
def b_to_a(b_str):
    """Convert the underscore event format back into the readable format.

    ``"a_dog_barks__1.0-2.0_3.0-4.0--a_man_speaks__5.0-6.0"`` becomes
    ``"a dog barks(1.0-2.0, 3.0-4.0); a man speaks(5.0-6.0)"``.

    Args:
        b_str: Events separated by ``--``, each written as ``name__spans``.

    Returns:
        The events joined by ``"; "``, each as ``name(spans)``. Segments
        without a ``__`` separator are skipped.
    """
    readable = []
    for segment in b_str.split('--'):
        # Split at the first "__"; an empty separator means it was absent.
        label, separator, spans = segment.partition('__')
        if not separator:
            continue
        label = label.replace('_', ' ')
        spans = spans.replace('_', ', ')
        readable.append(label + "(" + spans + ")")
    return '; '.join(readable)
def convert_tdc_to_tcc(b_str):
    """Build a plain caption from the event names of an underscore-format string.

    ``"a_dog_barks__1.0-2.0--a_man_speaks__5.0-6.0"`` becomes
    ``"a dog barks and a man speaks"``.

    Args:
        b_str: Events separated by ``--``, each written as ``name__spans``.

    Returns:
        The event names, with underscores turned into spaces, joined by
        ``" and "``. Segments without a ``__`` separator are ignored.
    """
    labels = [
        # Everything before the first "__" is the event name.
        segment.partition('__')[0].replace('_', ' ')
        for segment in b_str.split('--')
        if '__' in segment
    ]
    return ' and '.join(labels)
 def infer(input_text, input_onset, input_length, time_control):
     # para
+    input_onset = a_to_b(input_onset)
+    if not input_text and input_onset and is_tdc_format_valid(input_onset):
+        input_text = convert_tdc_to_tcc(input_onset)
+    elif not input_text:
+        input_text = "a dog barks"
     if input_onset and not is_tdc_format_valid(input_onset):
         input_onset = "random"
     if time_control:
         if not input_onset or not input_length:
             input_json = json.loads(get_time_info(input_text))
             waveform[0, 0].cpu().numpy(),
             samplerate=24000,
         )
+    #input_onset = b_to_a(input_onset)
     return output_wav, str(input_onset)
 demo = gr.Interface(
     fn=infer,
     inputs=[
+        gr.Textbox(label="TCC (necessary)", value="a dog barks"),
         gr.Textbox(label="TDC (optional, see format)", value="random"),
         gr.Textbox(label="Length (seconds, optional)", value="10.0"),
         gr.Checkbox(label="Enable Time Control", value=False),
     ],
     title="PicoAudio2 Online Inference",
     description=(
+        "TCC (temporal coarse caption) is necessary to generate audio. "
+        "If you need time control, please enter TDC and length (temporal detailed caption, in seconds). "
         "Alternatively, you can let the LLM generate TDC, but API quota limits may affect availability. "
+        "TDC format: \"event1(start1-end1, start2-end2); event2(start1-end1, start2-end2...)\", for example: "
+        "\"a dog barks(1.0-2.0, 3.0-4.0); a man speaks(5.0-6.0)\""
         "If the format of TDC is wrong or no input length, the model will generate audio without temporal control. Sorry!"
     )
 )