pyp1 committed on
Commit
53b6223
1 Parent(s): b61860c

update default prompt, fix number transcription bug

Browse files
.gitignore CHANGED
@@ -24,4 +24,5 @@ thumbs.db
24
  src/audiocraft
25
 
26
  !/demo/
27
- !/demo/*
 
 
24
  src/audiocraft
25
 
26
  !/demo/
27
+ !/demo/*
28
+ /demo/temp
app.py CHANGED
@@ -74,6 +74,8 @@ class WhisperxModel:
74
 
75
  def transcribe(self, audio_path):
76
  segments = self.model.transcribe(audio_path, batch_size=8)["segments"]
 
 
77
  return self.align_model.align(segments, audio_path)
78
 
79
  @spaces.GPU(duration=120)
@@ -176,7 +178,7 @@ def align(seed, transcript, audio_path):
176
  if align_model is None:
177
  raise gr.Error("Align model not loaded")
178
  seed_everything(seed)
179
-
180
  fragments = align_segments(transcript, audio_path)
181
  segments = [{
182
  "start": float(fragment["begin"]),
@@ -185,7 +187,6 @@ def align(seed, transcript, audio_path):
185
  } for fragment in fragments["fragments"]]
186
  segments = align_model.align(segments, audio_path)
187
  state = get_transcribe_state(segments)
188
-
189
  return [
190
  state["transcript_with_start_time"], state["transcript_with_end_time"],
191
  gr.Dropdown(value=state["word_bounds"][-1], choices=state["word_bounds"], interactive=True), # prompt_to_word
@@ -384,32 +385,32 @@ If disabled, you should write the target transcript yourself:</br>
384
  - In Edit mode write full prompt</br>
385
  """
386
 
387
- demo_original_transcript = "Gwynplaine had, besides, for his work and for his feats of strength, round his neck and over his shoulders, an esclavine of leather."
388
 
389
  demo_text = {
390
  "TTS": {
391
  "smart": "I cannot believe that the same model can also do text to speech synthesis too!",
392
- "regular": "Gwynplaine had, besides, for his work and for his feats of strength, I cannot believe that the same model can also do text to speech synthesis too!"
393
  },
394
  "Edit": {
395
  "smart": "take over the stage for half an hour,",
396
- "regular": "Gwynplaine had, besides, for his work and for his feats of strength, take over the stage for half an hour, an esclavine of leather."
397
  },
398
  "Long TTS": {
399
  "smart": "You can run the model on a big text!\n"
400
- "Just write it line-by-line. Or sentence-by-sentence.\n"
401
  "If some sentences sound odd, just rerun the model on them, no need to generate the whole text again!",
402
- "regular": "Gwynplaine had, besides, for his work and for his feats of strength, You can run the model on a big text!\n"
403
- "Gwynplaine had, besides, for his work and for his feats of strength, Just write it line-by-line. Or sentence-by-sentence.\n"
404
- "Gwynplaine had, besides, for his work and for his feats of strength, If some sentences sound odd, just rerun the model on them, no need to generate the whole text again!"
405
  }
406
  }
407
 
408
  all_demo_texts = {vv for k, v in demo_text.items() for kk, vv in v.items()}
409
 
410
- demo_words = ['0.069 Gwynplain 0.611', '0.671 had, 0.912', '0.952 besides, 1.414', '1.494 for 1.634', '1.695 his 1.835', '1.915 work 2.136', '2.196 and 2.297', '2.337 for 2.517', '2.557 his 2.678', '2.758 feats 3.019', '3.079 of 3.139', '3.2 strength, 3.561', '4.022 round 4.263', '4.303 his 4.444', '4.524 neck 4.705', '4.745 and 4.825', '4.905 over 5.086', '5.146 his 5.266', '5.307 shoulders, 5.768', '6.23 an 6.33', '6.531 esclavine 7.133', '7.213 of 7.293', '7.353 leather. 7.614']
411
 
412
- demo_words_info = [{'word': 'Gwynplain', 'start': 0.069, 'end': 0.611, 'score': 0.833}, {'word': 'had,', 'start': 0.671, 'end': 0.912, 'score': 0.879}, {'word': 'besides,', 'start': 0.952, 'end': 1.414, 'score': 0.863}, {'word': 'for', 'start': 1.494, 'end': 1.634, 'score': 0.89}, {'word': 'his', 'start': 1.695, 'end': 1.835, 'score': 0.669}, {'word': 'work', 'start': 1.915, 'end': 2.136, 'score': 0.916}, {'word': 'and', 'start': 2.196, 'end': 2.297, 'score': 0.766}, {'word': 'for', 'start': 2.337, 'end': 2.517, 'score': 0.808}, {'word': 'his', 'start': 2.557, 'end': 2.678, 'score': 0.786}, {'word': 'feats', 'start': 2.758, 'end': 3.019, 'score': 0.97}, {'word': 'of', 'start': 3.079, 'end': 3.139, 'score': 0.752}, {'word': 'strength,', 'start': 3.2, 'end': 3.561, 'score': 0.742}, {'word': 'round', 'start': 4.022, 'end': 4.263, 'score': 0.916}, {'word': 'his', 'start': 4.303, 'end': 4.444, 'score': 0.666}, {'word': 'neck', 'start': 4.524, 'end': 4.705, 'score': 0.908}, {'word': 'and', 'start': 4.745, 'end': 4.825, 'score': 0.882}, {'word': 'over', 'start': 4.905, 'end': 5.086, 'score': 0.847}, {'word': 'his', 'start': 5.146, 'end': 5.266, 'score': 0.791}, {'word': 'shoulders,', 'start': 5.307, 'end': 5.768, 'score': 0.729}, {'word': 'an', 'start': 6.23, 'end': 6.33, 'score': 0.854}, {'word': 'esclavine', 'start': 6.531, 'end': 7.133, 'score': 0.803}, {'word': 'of', 'start': 7.213, 'end': 7.293, 'score': 0.772}, {'word': 'leather.', 'start': 7.353, 'end': 7.614, 'score': 0.896}]
413
 
414
 
415
  def update_demo(mode, smart_transcript, edit_word_mode, transcript, edit_from_word, edit_to_word):
@@ -445,7 +446,7 @@ def get_app():
445
 
446
  with gr.Row():
447
  with gr.Column(scale=2):
448
- input_audio = gr.Audio(value=f"{DEMO_PATH}/5895_34622_000026_000002.wav", label="Input Audio", type="filepath", interactive=True)
449
  with gr.Group():
450
  original_transcript = gr.Textbox(label="Original transcript", lines=5, value=demo_original_transcript,
451
  info="Use whisperx model to get the transcript. Fix and align it if necessary.")
@@ -473,16 +474,16 @@ def get_app():
473
  info="What to do with first and last word", visible=False)
474
 
475
  with gr.Group() as tts_mode_controls:
476
- prompt_to_word = gr.Dropdown(label="Last word in prompt", choices=demo_words, value=demo_words[11], interactive=True)
477
- prompt_end_time = gr.Slider(label="Prompt end time", minimum=0, maximum=7.614, step=0.001, value=3.600)
478
 
479
  with gr.Group(visible=False) as edit_mode_controls:
480
  with gr.Row():
481
- edit_from_word = gr.Dropdown(label="First word to edit", choices=demo_words, value=demo_words[12], interactive=True)
482
- edit_to_word = gr.Dropdown(label="Last word to edit", choices=demo_words, value=demo_words[18], interactive=True)
483
  with gr.Row():
484
- edit_start_time = gr.Slider(label="Edit from time", minimum=0, maximum=7.614, step=0.001, value=4.022)
485
- edit_end_time = gr.Slider(label="Edit to time", minimum=0, maximum=7.614, step=0.001, value=5.768)
486
 
487
  run_btn = gr.Button(value="Run")
488
 
@@ -500,11 +501,11 @@ def get_app():
500
  with gr.Row():
501
  with gr.Accordion("Generation Parameters - change these if you are unhappy with the generation", open=False):
502
  stop_repetition = gr.Radio(label="stop_repetition", choices=[-1, 1, 2, 3, 4], value=3,
503
- info="if there are long silence in the generated audio, reduce the stop_repetition to 2 or 1. -1 = disabled")
504
- sample_batch_size = gr.Number(label="speech rate", value=3, precision=0,
505
  info="The higher the number, the faster the output will be. "
506
  "Under the hood, the model will generate this many samples and choose the shortest one. "
507
- "For giga330M_TTSEnhanced, 1 or 2 should be fine since the model is trained to do TTS.")
508
  seed = gr.Number(label="seed", value=-1, precision=0, info="random seeds always works :)")
509
  kvcache = gr.Radio(label="kvcache", choices=[0, 1], value=1,
510
  info="set to 0 to use less VRAM, but with slower inference")
 
74
 
75
  def transcribe(self, audio_path):
76
  segments = self.model.transcribe(audio_path, batch_size=8)["segments"]
77
+ for segment in segments:
78
+ segment['text'] = replace_numbers_with_words(segment['text'])
79
  return self.align_model.align(segments, audio_path)
80
 
81
  @spaces.GPU(duration=120)
 
178
  if align_model is None:
179
  raise gr.Error("Align model not loaded")
180
  seed_everything(seed)
181
+ transcript = replace_numbers_with_words(transcript).replace(" ", " ").replace(" ", " ") # replace numbers with words, so that the phonemizer can do a better job
182
  fragments = align_segments(transcript, audio_path)
183
  segments = [{
184
  "start": float(fragment["begin"]),
 
187
  } for fragment in fragments["fragments"]]
188
  segments = align_model.align(segments, audio_path)
189
  state = get_transcribe_state(segments)
 
190
  return [
191
  state["transcript_with_start_time"], state["transcript_with_end_time"],
192
  gr.Dropdown(value=state["word_bounds"][-1], choices=state["word_bounds"], interactive=True), # prompt_to_word
 
385
  - In Edit mode write full prompt</br>
386
  """
387
 
388
+ demo_original_transcript = "And again in two thousand and eight when the United States Central Bank, the Federal Reserve, printed over two trillion dollars."
389
 
390
  demo_text = {
391
  "TTS": {
392
  "smart": "I cannot believe that the same model can also do text to speech synthesis too!",
393
+ "regular": "And again in two thousand and eight when the United States Central Bank, I cannot believe that the same model can also do text to speech synthesis too!"
394
  },
395
  "Edit": {
396
  "smart": "take over the stage for half an hour,",
397
+ "regular": "And again in two thousand and eight when the United States Central Bank, take over the stage for half an hour, printed over two trillion dollars."
398
  },
399
  "Long TTS": {
400
  "smart": "You can run the model on a big text!\n"
401
+ "Just write it line by line. Or sentence by sentence.\n"
402
  "If some sentences sound odd, just rerun the model on them, no need to generate the whole text again!",
403
+ "regular": "And again in two thousand and eight when the United States Central Bank, You can run the model on a big text!\n"
404
+ "And again in two thousand and eight when the United States Central Bank, Just write it line by line. Or sentence by sentence.\n"
405
+ "And again in two thousand and eight when the United States Central Bank, If some sentences sound odd, just rerun the model on them, no need to generate the whole text again!"
406
  }
407
  }
408
 
409
  all_demo_texts = {vv for k, v in demo_text.items() for kk, vv in v.items()}
410
 
411
+ demo_words = ['0.12 And 0.221', '0.261 again 0.561', '0.622 in 0.682', '0.742 two 0.922', '0.983 thousand 1.464', '1.504 and 1.584', '1.684 eight 1.865', '1.945 when 2.085', '2.125 the 2.206', '2.266 United 2.667', '2.707 States 2.968', '3.008 Central 3.349', '3.389 Bank, 3.649', '3.83 the 3.93', '4.01 Federal 4.451', '4.532 Reserve 5.113', '5.314 printed 5.674', '5.835 over 6.035', '6.176 two 6.517', '6.637 trillion 7.098', '7.118 dollars. 7.479']
412
 
413
+ demo_words_info = [{'word': 'And', 'start': 0.12, 'end': 0.221, 'score': 0.792}, {'word': 'again', 'start': 0.261, 'end': 0.561, 'score': 0.795}, {'word': 'in', 'start': 0.622, 'end': 0.682, 'score': 0.75}, {'word': 'two', 'start': 0.742, 'end': 0.922, 'score': 0.755}, {'word': 'thousand', 'start': 0.983, 'end': 1.464, 'score': 0.82}, {'word': 'and', 'start': 1.504, 'end': 1.584, 'score': 0.715}, {'word': 'eight', 'start': 1.684, 'end': 1.865, 'score': 0.885}, {'word': 'when', 'start': 1.945, 'end': 2.085, 'score': 0.987}, {'word': 'the', 'start': 2.125, 'end': 2.206, 'score': 0.833}, {'word': 'United', 'start': 2.266, 'end': 2.667, 'score': 0.818}, {'word': 'States', 'start': 2.707, 'end': 2.968, 'score': 0.842}, {'word': 'Central', 'start': 3.008, 'end': 3.349, 'score': 0.852}, {'word': 'Bank,', 'start': 3.389, 'end': 3.649, 'score': 0.98}, {'word': 'the', 'start': 3.83, 'end': 3.93, 'score': 0.996}, {'word': 'Federal', 'start': 4.01, 'end': 4.451, 'score': 0.795}, {'word': 'Reserve', 'start': 4.532, 'end': 5.113, 'score': 0.852}, {'word': 'printed', 'start': 5.314, 'end': 5.674, 'score': 0.785}, {'word': 'over', 'start': 5.835, 'end': 6.035, 'score': 0.84}, {'word': 'two', 'start': 6.176, 'end': 6.517, 'score': 0.757}, {'word': 'trillion', 'start': 6.637, 'end': 7.098, 'score': 0.796}, {'word': 'dollars.', 'start': 7.118, 'end': 7.479, 'score': 0.939}]
414
 
415
 
416
  def update_demo(mode, smart_transcript, edit_word_mode, transcript, edit_from_word, edit_to_word):
 
446
 
447
  with gr.Row():
448
  with gr.Column(scale=2):
449
+ input_audio = gr.Audio(value=f"{DEMO_PATH}/YOU1000000115_S0000252.wav", label="Input Audio", type="filepath", interactive=True)
450
  with gr.Group():
451
  original_transcript = gr.Textbox(label="Original transcript", lines=5, value=demo_original_transcript,
452
  info="Use whisperx model to get the transcript. Fix and align it if necessary.")
 
474
  info="What to do with first and last word", visible=False)
475
 
476
  with gr.Group() as tts_mode_controls:
477
+ prompt_to_word = gr.Dropdown(label="Last word in prompt", choices=demo_words, value=demo_words[12], interactive=True)
478
+ prompt_end_time = gr.Slider(label="Prompt end time", minimum=0, maximum=7.86, step=0.001, value=3.675)
479
 
480
  with gr.Group(visible=False) as edit_mode_controls:
481
  with gr.Row():
482
+ edit_from_word = gr.Dropdown(label="First word to edit", choices=demo_words, value=demo_words[13], interactive=True)
483
+ edit_to_word = gr.Dropdown(label="Last word to edit", choices=demo_words, value=demo_words[15], interactive=True)
484
  with gr.Row():
485
+ edit_start_time = gr.Slider(label="Edit from time", minimum=0, maximum=7.86, step=0.001, value=3.83)
486
+ edit_end_time = gr.Slider(label="Edit to time", minimum=0, maximum=7.86, step=0.001, value=5.113)
487
 
488
  run_btn = gr.Button(value="Run")
489
 
 
501
  with gr.Row():
502
  with gr.Accordion("Generation Parameters - change these if you are unhappy with the generation", open=False):
503
  stop_repetition = gr.Radio(label="stop_repetition", choices=[-1, 1, 2, 3, 4], value=3,
504
+ info="if there are long silence in the generated audio, reduce the stop_repetition to 1 or 2. -1 = disabled")
505
+ sample_batch_size = gr.Number(label="speech rate", value=2, precision=0,
506
  info="The higher the number, the faster the output will be. "
507
  "Under the hood, the model will generate this many samples and choose the shortest one. "
508
+ "For TTSEnhanced models, 1~3 should be fine since the model is trained to do TTS.")
509
  seed = gr.Number(label="seed", value=-1, precision=0, info="random seeds always works :)")
510
  kvcache = gr.Radio(label="kvcache", choices=[0, 1], value=1,
511
  info="set to 0 to use less VRAM, but with slower inference")
data/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (137 Bytes)
 
data/__pycache__/tokenizer.cpython-310.pyc DELETED
Binary file (4.83 kB)
 
demo/YOU1000000115_S0000252.wav ADDED
Binary file (252 kB). View file
 
demo/temp/84_121550_000074_000000.txt DELETED
@@ -1 +0,0 @@
1
- But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks,
 
 
demo/temp/84_121550_000074_000000.wav DELETED
Binary file (508 kB)
 
demo/temp/mfa_alignments/84_121550_000074_000000.csv DELETED
@@ -1,109 +0,0 @@
1
- Begin,End,Label,Type,Speaker
2
- 0.03,0.18,but,words,temp
3
- 0.18,0.32,when,words,temp
4
- 0.32,0.48,i,words,temp
5
- 0.48,0.64,had,words,temp
6
- 0.64,1.19,approached,words,temp
7
- 1.22,1.58,so,words,temp
8
- 1.58,1.91,near,words,temp
9
- 1.91,2.07,to,words,temp
10
- 2.07,2.42,them,words,temp
11
- 2.53,2.61,the,words,temp
12
- 2.61,3.01,common,words,temp
13
- 3.05,3.62,object,words,temp
14
- 3.68,3.93,which,words,temp
15
- 3.93,4.02,the,words,temp
16
- 4.02,4.34,sense,words,temp
17
- 4.34,4.97,deceives,words,temp
18
- 5.04,5.54,lost,words,temp
19
- 5.54,6.0,not,words,temp
20
- 6.0,6.14,by,words,temp
21
- 6.14,6.67,distance,words,temp
22
- 6.79,7.05,any,words,temp
23
- 7.05,7.18,of,words,temp
24
- 7.18,7.34,its,words,temp
25
- 7.34,7.87,marks,words,temp
26
- 0.03,0.06,B,phones,temp
27
- 0.06,0.09,AH1,phones,temp
28
- 0.09,0.18,T,phones,temp
29
- 0.18,0.23,W,phones,temp
30
- 0.23,0.27,EH1,phones,temp
31
- 0.27,0.32,N,phones,temp
32
- 0.32,0.48,AY1,phones,temp
33
- 0.48,0.49,HH,phones,temp
34
- 0.49,0.6,AE1,phones,temp
35
- 0.6,0.64,D,phones,temp
36
- 0.64,0.7,AH0,phones,temp
37
- 0.7,0.83,P,phones,temp
38
- 0.83,0.88,R,phones,temp
39
- 0.88,0.99,OW1,phones,temp
40
- 0.99,1.12,CH,phones,temp
41
- 1.12,1.19,T,phones,temp
42
- 1.22,1.4,S,phones,temp
43
- 1.4,1.58,OW1,phones,temp
44
- 1.58,1.7,N,phones,temp
45
- 1.7,1.84,IH1,phones,temp
46
- 1.84,1.91,R,phones,temp
47
- 1.91,2.01,T,phones,temp
48
- 2.01,2.07,AH0,phones,temp
49
- 2.07,2.13,DH,phones,temp
50
- 2.13,2.3,EH1,phones,temp
51
- 2.3,2.42,M,phones,temp
52
- 2.53,2.55,DH,phones,temp
53
- 2.55,2.61,AH0,phones,temp
54
- 2.61,2.73,K,phones,temp
55
- 2.73,2.85,AA1,phones,temp
56
- 2.85,2.9,M,phones,temp
57
- 2.9,2.95,AH0,phones,temp
58
- 2.95,3.01,N,phones,temp
59
- 3.05,3.22,AA1,phones,temp
60
- 3.22,3.27,B,phones,temp
61
- 3.27,3.34,JH,phones,temp
62
- 3.34,3.48,EH0,phones,temp
63
- 3.48,3.54,K,phones,temp
64
- 3.54,3.62,T,phones,temp
65
- 3.68,3.69,HH,phones,temp
66
- 3.69,3.76,W,phones,temp
67
- 3.76,3.8,IH1,phones,temp
68
- 3.8,3.93,CH,phones,temp
69
- 3.93,3.95,DH,phones,temp
70
- 3.95,4.02,AH0,phones,temp
71
- 4.02,4.12,S,phones,temp
72
- 4.12,4.21,EH1,phones,temp
73
- 4.21,4.27,N,phones,temp
74
- 4.27,4.34,S,phones,temp
75
- 4.34,4.42,D,phones,temp
76
- 4.42,4.45,IH0,phones,temp
77
- 4.45,4.59,S,phones,temp
78
- 4.59,4.79,IY1,phones,temp
79
- 4.79,4.87,V,phones,temp
80
- 4.87,4.97,Z,phones,temp
81
- 5.04,5.12,L,phones,temp
82
- 5.12,5.33,AO1,phones,temp
83
- 5.33,5.42,S,phones,temp
84
- 5.42,5.54,T,phones,temp
85
- 5.54,5.7,N,phones,temp
86
- 5.7,5.89,AA1,phones,temp
87
- 5.89,6.0,T,phones,temp
88
- 6.0,6.05,B,phones,temp
89
- 6.05,6.14,AY1,phones,temp
90
- 6.14,6.24,D,phones,temp
91
- 6.24,6.3,IH1,phones,temp
92
- 6.3,6.38,S,phones,temp
93
- 6.38,6.45,T,phones,temp
94
- 6.45,6.51,AH0,phones,temp
95
- 6.51,6.57,N,phones,temp
96
- 6.57,6.67,S,phones,temp
97
- 6.79,6.89,EH1,phones,temp
98
- 6.89,6.95,N,phones,temp
99
- 6.95,7.05,IY0,phones,temp
100
- 7.05,7.13,AH0,phones,temp
101
- 7.13,7.18,V,phones,temp
102
- 7.18,7.22,IH0,phones,temp
103
- 7.22,7.29,T,phones,temp
104
- 7.29,7.34,S,phones,temp
105
- 7.34,7.39,M,phones,temp
106
- 7.39,7.5,AA1,phones,temp
107
- 7.5,7.58,R,phones,temp
108
- 7.58,7.7,K,phones,temp
109
- 7.7,7.87,S,phones,temp
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
models/__pycache__/codebooks_patterns.cpython-310.pyc DELETED
Binary file (25 kB)
 
models/__pycache__/voicecraft.cpython-310.pyc DELETED
Binary file (40.1 kB)
 
models/modules/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (147 Bytes)
 
models/modules/__pycache__/__init__.cpython-39.pyc DELETED
Binary file (145 Bytes)
 
models/modules/__pycache__/activation.cpython-310.pyc DELETED
Binary file (18.8 kB)
 
models/modules/__pycache__/activation.cpython-39.pyc DELETED
Binary file (18.8 kB)
 
models/modules/__pycache__/embedding.cpython-310.pyc DELETED
Binary file (3.08 kB)
 
models/modules/__pycache__/embedding.cpython-39.pyc DELETED
Binary file (3.05 kB)
 
models/modules/__pycache__/scaling.cpython-310.pyc DELETED
Binary file (40.4 kB)
 
models/modules/__pycache__/scaling.cpython-39.pyc DELETED
Binary file (40 kB)
 
models/modules/__pycache__/transformer.cpython-310.pyc DELETED
Binary file (16.1 kB)
 
models/modules/__pycache__/transformer.cpython-39.pyc DELETED
Binary file (15.8 kB)
 
models/modules/__pycache__/utils.cpython-310.pyc DELETED
Binary file (1.42 kB)
 
models/modules/__pycache__/utils.cpython-39.pyc DELETED
Binary file (1.42 kB)
 
models/modules/__pycache__/visualizer.cpython-39.pyc DELETED
Binary file (2.02 kB)