rookie9 committed on
Commit
8ae83f3
·
verified ·
1 Parent(s): bd9bdce

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -5
app.py CHANGED
@@ -21,10 +21,54 @@ def is_tdc_format_valid(tdc_str):
21
  except Exception:
22
  return False
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  def infer(input_text, input_onset, input_length, time_control):
25
  # para
 
 
 
 
 
 
26
  if input_onset and not is_tdc_format_valid(input_onset):
27
  input_onset = "random"
 
28
  if time_control:
29
  if not input_onset or not input_length:
30
  input_json = json.loads(get_time_info(input_text))
@@ -48,12 +92,13 @@ def infer(input_text, input_onset, input_length, time_control):
48
  waveform[0, 0].cpu().numpy(),
49
  samplerate=24000,
50
  )
 
51
  return output_wav, str(input_onset)
52
 
53
  demo = gr.Interface(
54
  fn=infer,
55
  inputs=[
56
- gr.Textbox(label="TCC (caption, required)", value="a dog barks"),
57
  gr.Textbox(label="TDC (optional, see format)", value="random"),
58
  gr.Textbox(label="Length (seconds, optional)", value="10.0"),
59
  gr.Checkbox(label="Enable Time Control", value=False),
@@ -64,11 +109,11 @@ demo = gr.Interface(
64
  ],
65
  title="PicoAudio2 Online Inference",
66
  description=(
67
- "TCC (caption) is neto generate audio. "
68
- "If you need time control, please enter TDC and length (in seconds). "
69
  "Alternatively, you can let the LLM generate TDC, but API quota limits may affect availability. "
70
- "TDC format: \"event1__start1-end1_start2-end2--event2__start1-end1\", for example: "
71
- "\"a_dog_barks__1.0-2.0_3.0-4.0--a_man_speaks__5.0-6.0\"."
72
  "If the format of TDC is wrong or no input length, the model will generate audio without temporal control. Sorry!"
73
  )
74
  )
 
21
  except Exception:
22
  return False
23
 
24
def a_to_b(a_str):
    """Convert a human-readable TDC string into the underscore-delimited form.

    Input looks like ``"a dog barks(1.0-2.0, 3.0-4.0); a man speaks(5.0-6.0)"``
    and is turned into ``"a_dog_barks__1.0-2.0_3.0-4.0--a_man_speaks__5.0-6.0"``.

    Args:
        a_str: Events separated by ``;``; each event is a name followed by a
            parenthesised, comma-separated list of time spans. ``None`` and
            the empty string are accepted.

    Returns:
        The events joined with ``--``; each event is ``name__span_span...``.
        Items that do not contain both ``(`` and ``)`` are dropped, so input
        with no well-formed event (including ``None``/empty) yields ``""``.
    """
    if not a_str:
        return ''
    result = []
    for item in a_str.split(';'):
        item = item.strip()
        if '(' not in item or ')' not in item:
            continue
        name, times = item.split('(', 1)
        # Collapse any run of whitespace into ONE underscore: a double
        # underscore inside the name would be mistaken for the name/time
        # separator of the target format.
        name = '_'.join(name.split())
        # Keep only the text inside the parentheses, then tolerate arbitrary
        # spacing around the commas between time spans.
        spans = [span.strip() for span in times.partition(')')[0].split(',')]
        times = '_'.join(span for span in spans if span)
        result.append(f"{name}__{times}")
    return '--'.join(result)
37
+
38
def b_to_a(b_str):
    """Convert an underscore-delimited TDC string back to the readable form.

    Input looks like ``"a_dog_barks__1.0-2.0_3.0-4.0--a_man_speaks__5.0-6.0"``
    and is turned into ``"a dog barks(1.0-2.0, 3.0-4.0); a man speaks(5.0-6.0)"``.

    Args:
        b_str: Events separated by ``--``; each event is ``name__times`` with
            underscores between words of the name and between time spans.
            ``None`` and the empty string are accepted.

    Returns:
        The events joined with ``"; "``, each rendered as ``name(times)``.
        Events without a ``__`` separator are dropped, so input with no
        well-formed event (including ``None``/empty) yields ``""``.
    """
    if not b_str:
        return ''
    result = []
    for event in b_str.split('--'):
        if '__' not in event:
            continue
        # Split on the FIRST double underscore only: the left part is the
        # event name, the right part the underscore-joined time spans.
        name, times = event.split('__', 1)
        name = name.replace('_', ' ')
        times = times.replace('_', ', ')
        result.append(f"{name}({times})")
    return '; '.join(result)
49
+
50
def convert_tdc_to_tcc(b_str):
    """Build a plain caption from the event names of a TDC string.

    ``"a_dog_barks__1.0-2.0--a_man_speaks__5.0-6.0"`` becomes
    ``"a dog barks and a man speaks"``; the time spans are discarded.

    Args:
        b_str: Underscore-delimited TDC string: events separated by ``--``,
            each of the form ``name__times``. ``None`` and the empty string
            are accepted.

    Returns:
        The event names (underscores replaced by spaces) joined with
        ``" and "``. Events without a ``__`` separator or with a blank name
        are skipped; if nothing remains the result is ``""``.
    """
    if not b_str:
        return ''
    names = []
    for event in b_str.split('--'):
        if '__' not in event:
            continue
        name = event.split('__', 1)[0].replace('_', ' ').strip()
        # A blank name would otherwise leave a dangling " and " in the caption.
        if name:
            names.append(name)
    return ' and '.join(names)
60
+
61
  def infer(input_text, input_onset, input_length, time_control):
62
  # para
63
+ input_onset = a_to_b(input_onset)
64
+ if not input_text and input_onset and is_tdc_format_valid(input_onset):
65
+ input_text = convert_tdc_to_tcc(input_onset)
66
+ elif not input_text:
67
+ input_text = "a dog barks"
68
+
69
  if input_onset and not is_tdc_format_valid(input_onset):
70
  input_onset = "random"
71
+
72
  if time_control:
73
  if not input_onset or not input_length:
74
  input_json = json.loads(get_time_info(input_text))
 
92
  waveform[0, 0].cpu().numpy(),
93
  samplerate=24000,
94
  )
95
+ #input_onset = b_to_a(input_onset)
96
  return output_wav, str(input_onset)
97
 
98
  demo = gr.Interface(
99
  fn=infer,
100
  inputs=[
101
+ gr.Textbox(label="TCC (necessary)", value="a dog barks"),
102
  gr.Textbox(label="TDC (optional, see format)", value="random"),
103
  gr.Textbox(label="Length (seconds, optional)", value="10.0"),
104
  gr.Checkbox(label="Enable Time Control", value=False),
 
109
  ],
110
  title="PicoAudio2 Online Inference",
111
  description=(
112
+ "TCC (temporal coarse caption) is necessary to generate audio. "
113
+ "If you need time control, please enter TDC and length (temporal detailed caption, in seconds). "
114
  "Alternatively, you can let the LLM generate TDC, but API quota limits may affect availability. "
115
+ "TDC format: \"event1(start1-end1, start2-end2); event2(start1-end1, start2-end2...)\", for example: "
116
+ "\"a dog barks(1.0-2.0, 3.0-4.0); a man speaks(5.0-6.0)\""
117
  "If the format of TDC is wrong or no input length, the model will generate audio without temporal control. Sorry!"
118
  )
119
  )