arnavmehta7 committed on
Commit
b0c547a
•
1 Parent(s): 26fdaed

Update app.py

Files changed (1)
  app.py  +78 -21
app.py CHANGED
@@ -4,16 +4,14 @@ import torch
 import librosa
 from pathlib import Path
 import tempfile, torchaudio
-# from faster_whisper import WhisperModel
 from transformers import pipeline
 from uuid import uuid4
 
 # Load the MARS5 model
 mars5, config_class = torch.hub.load('Camb-ai/mars5-tts', 'mars5_english', trust_repo=True)
-# asr_model = WhisperModel("small", device="cpu", compute_type="int8")
 asr_model = pipeline(
     "automatic-speech-recognition",
-    model="openai/whisper-medium",
+    model="openai/whisper-tiny",
     chunk_length_s=30,
     device=torch.device("cuda:0"),
 )
@@ -24,15 +22,16 @@ def transcribe_file(f: str) -> str:
     return " ".join([prediction["text"] for prediction in predictions])
 
 # Function to process the text and audio input and generate the synthesized output
-def synthesize(text, audio_file, transcript):
-    audio_file = Path(audio_file)
-    temp_file = f"{uuid4()}.{audio_file.suffix}"
+def synthesize(text, audio_file, transcript, kwargs_dict):
+    print(f">>>>>>> Kwargs dict: {kwargs_dict}")
+    # audio_file = Path(audio_file)
+    # temp_file = f"{uuid4()}.{audio_file.suffix}"
 
-    # copying the audio_file
-    with open(audio_file, 'rb') as src, open(temp_file, 'wb') as dst:
-        dst.write(src.read())
+    # # copying the audio_file
+    # with open(audio_file, 'rb') as src, open(temp_file, 'wb') as dst:
+    #     dst.write(src.read())
 
-    audio_file = temp_file
+    # audio_file = temp_file
 
     print(f">>>>> synthesizing! audio_file: {audio_file}")
     if not transcript:
@@ -43,11 +42,10 @@ def synthesize(text, audio_file, transcript):
     wav = torch.from_numpy(wav)
 
     # Define the configuration for the TTS model
-    deep_clone = True
-    cfg = config_class(deep_clone=deep_clone, rep_penalty_window=100, top_k=100, temperature=0.7, freq_penalty=3)
+    cfg = config_class(**kwargs_dict)
 
     # Generate the synthesized audio
-    ar_codes, wav_out = mars5.tts(text, wav, transcript, cfg=cfg)
+    ar_codes, wav_out = mars5.tts(text, wav, transcript.strip(), cfg=cfg)
 
     # Save the synthesized audio to a temporary file
     output_path = Path(tempfile.mktemp(suffix=".wav"))
@@ -73,7 +71,7 @@ with gr.Blocks() as demo:
     text = gr.Textbox(label="Text to synthesize")
     audio_file = gr.Audio(label="Audio file to clone from", type="filepath")
 
-    generate_btn = gr.Button(label="Generate Synthesized Audio")
+    generate_btn = gr.Button("Generate Synthesized Audio")
 
     with gr.Accordion("Advanced Settings", open=False):
         gr.Markdown("additional inference settings\nWARNING: changing these incorrectly may degrade quality.")
@@ -86,18 +84,77 @@ with gr.Blocks() as demo:
         presence_penalty = gr.Slider(minimum=0, maximum=5, step=0.05, label="presence_penalty", value=defaults['presence_penalty'])
         rep_penalty_window = gr.Slider(minimum=1, maximum=500, step=1, label="rep_penalty_window", value=defaults['rep_penalty_window'])
         nar_guidance_w = gr.Slider(minimum=1, maximum=8, step=0.1, label="nar_guidance_w", value=defaults['nar_guidance_w'])
-        meta_n = gr.Slider(minimum=1, maximum=10, step=1, label="meta_n", value=2, interactive=False)
         deep_clone = gr.Checkbox(value=defaults['deep_clone'], label='deep_clone')
-
-    dummy = gr.Number(label='Example number', visible=False)
-
+
     output = gr.Audio(label="Synthesized Audio", type="filepath")
-    def on_click(text, audio_file, prompt_text):
+    def on_click(
+        text,
+        audio_file,
+        prompt_text,
+        temperature,
+        top_k,
+        top_p,
+        typical_p,
+        freq_penalty,
+        presence_penalty,
+        rep_penalty_window,
+        nar_guidance_w,
+        deep_clone
+    ):
         print(f">>>> transcript: {prompt_text}; audio_file = {audio_file}")
-        of = synthesize(text, audio_file, prompt_text)
+        of = synthesize(
+            text,
+            audio_file,
+            prompt_text,
+            {
+                'temperature': temperature,
+                'top_k': top_k,
+                'top_p': top_p,
+                'typical_p': typical_p,
+                'freq_penalty': freq_penalty,
+                'presence_penalty': presence_penalty,
+                'rep_penalty_window': rep_penalty_window,
+                'nar_guidance_w': nar_guidance_w,
+                'deep_clone': deep_clone
+            }
+        )
         print(f">>>> output file: {of}")
         return of
 
-    generate_btn.click(on_click, inputs=[text, audio_file, prompt_text], outputs=[output])
+    generate_btn.click(
+        on_click,
+        inputs=[
+            text,
+            audio_file,
+            prompt_text,
+            temperature,
+            top_k,
+            top_p,
+            typical_p,
+            freq_penalty,
+            presence_penalty,
+            rep_penalty_window,
+            nar_guidance_w,
+            deep_clone
+        ],
+        outputs=[output]
+    )
+
+    gr.Markdown("### Examples")
+
+    # Add examples
+    defaults = [0.8, -1, 0.2, 1.0, 2.6, 0.4, 100, 3, True]
+    examples = [
+        ["Today is a wonderful day!", "female_speaker_1.flac", "People look, but no one ever finds it.", *defaults],
+        ["You guys need to figure this out.", "male_speaker_1.flac", "Ask her to bring these things with her from the store.", *defaults]
+    ]
+
+    gr.Examples(
+        examples=examples,
+        inputs=[text, audio_file, prompt_text, temperature, top_k, top_p, typical_p, freq_penalty, presence_penalty, rep_penalty_window, nar_guidance_w, deep_clone],
+        outputs=[output],
+        cache_examples=False,
+        fn=on_click
+    )
 
 demo.launch(share=False)
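
A minimal sketch of exercising the new synthesize(text, audio_file, transcript, kwargs_dict) signature outside the Gradio UI, assuming app.py has already loaded the MARS5 model as above. The reference clip path below is a placeholder, and the values simply mirror the defaults list wired into the example rows; synthesize() forwards the dict to config_class(**kwargs_dict).

# Sketch only: reuses the defaults from the Gradio examples above.
# "my_reference.flac" is a hypothetical local clip, not a file from this repo.
cfg_kwargs = {
    'temperature': 0.8,
    'top_k': -1,
    'top_p': 0.2,
    'typical_p': 1.0,
    'freq_penalty': 2.6,
    'presence_penalty': 0.4,
    'rep_penalty_window': 100,
    'nar_guidance_w': 3,
    'deep_clone': True,
}

out_path = synthesize(
    "Today is a wonderful day!",                # text to synthesize
    "my_reference.flac",                        # audio file to clone from
    "People look, but no one ever finds it.",   # transcript of the reference clip
    cfg_kwargs,                                 # forwarded to config_class(**kwargs_dict)
)
print(out_path)  # path of the synthesized .wav written to a temporary file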