Plachta committed
Commit 4d6f78c
1 Parent(s): d69c31f

Replaced Encodec with Vocos

Files changed (1)
  1. app.py +11 -7
app.py CHANGED
@@ -44,8 +44,8 @@ text_tokenizer = PhonemeBpeTokenizer(tokenizer_path="./utils/g2p/bpe_69.json")
 text_collater = get_text_token_collater()
 
 device = torch.device("cpu")
-if torch.cuda.is_available():
-    device = torch.device("cuda", 0)
+# if torch.cuda.is_available():
+#     device = torch.device("cuda", 0)
 
 # VALL-E-X model
 model = VALLE(
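This hunk pins inference to the CPU by commenting out the CUDA branch, presumably to suit a CPU-only host. A minimal sketch of an alternative that keeps the GPU path behind an opt-in switch instead of comments; the VALLEX_USE_CUDA variable is a hypothetical name, not part of this commit:

import os
import torch

# Opt into CUDA explicitly so CPU-only hosts never take the GPU path.
device = torch.device("cpu")
if os.environ.get("VALLEX_USE_CUDA") == "1" and torch.cuda.is_available():
    device = torch.device("cuda", 0)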
@@ -141,17 +141,18 @@ def make_npz_prompt(name, uploaded_audio, recorded_audio, transcript_content):
 
     if transcript_content == "":
         lang_pr, text_pr = transcribe_one(wav_pr, sr)
+        lang_token = lang2token[lang_pr]
+        text_pr = lang_token + text_pr + lang_token
     else:
         lang_pr = langid.classify(str(transcript_content))[0]
         lang_token = lang2token[lang_pr]
+        transcript_content = transcript_content.replace("\n", "")
         text_pr = f"{lang_token}{str(transcript_content)}{lang_token}"
     # tokenize audio
     encoded_frames = tokenize_audio(audio_tokenizer, (wav_pr, sr))
     audio_tokens = encoded_frames[0][0].transpose(2, 1).cpu().numpy()
 
     # tokenize text
-    lang_token = lang2token[lang_pr]
-    text_pr = lang_token + text_pr + lang_token
     phonemes, _ = text_tokenizer.tokenize(text=f"{text_pr}".strip())
     text_tokens, enroll_x_lens = text_collater(
         [
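Before this hunk, make_npz_prompt wrapped text_pr in language tokens unconditionally after the if/else, so a user-supplied transcript, already wrapped inside the else branch, came out wrapped twice. Moving the wrap into each branch applies it exactly once, and pasted transcripts now have newlines stripped first. A self-contained sketch of the old double-wrap; the [EN] token format is assumed for illustration only:

lang2token = {"en": "[EN]"}        # assumed token format, illustration only
text_pr = "[EN]hello world[EN]"    # else-branch output: already wrapped
lang_token = lang2token["en"]
old = lang_token + text_pr + lang_token
print(old)                         # "[EN][EN]hello world[EN][EN]" (the bug)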
@@ -193,16 +194,20 @@ def infer_from_audio(text, language, accent, audio_prompt, record_audio_prompt,
 
     if transcript_content == "":
         lang_pr, text_pr = transcribe_one(wav_pr, sr)
+        lang_token = lang2token[lang_pr]
+        text_pr = lang_token + text_pr + lang_token
     else:
         lang_pr = langid.classify(str(transcript_content))[0]
+        text_pr = transcript_content.replace("\n", "")
         lang_token = lang2token[lang_pr]
-        text_pr = f"{lang_token}{str(transcript_content)}{lang_token}"
+        text_pr = lang_token + text_pr + lang_token
 
     if language == 'auto-detect':
         lang_token = lang2token[langid.classify(text)[0]]
     else:
         lang_token = langdropdown2token[language]
     lang = token2lang[lang_token]
+    text = text.replace("\n", "")
     text = lang_token + text + lang_token
 
     if lang_pr not in ['ja', 'zh', 'en']:
@@ -223,8 +228,6 @@ def infer_from_audio(text, language, accent, audio_prompt, record_audio_prompt,
 
     enroll_x_lens = None
     if text_pr:
-        lang_token = lang2token[lang_pr]
-        text_pr = lang_token + text_pr + lang_token
         text_prompts, _ = text_tokenizer.tokenize(text=f"{text_pr}".strip())
         text_prompts, enroll_x_lens = text_collater(
             [
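The same reordering repeats in infer_from_audio, and the second hunk deletes the now-redundant re-wrap before the prompt text is tokenized. The strip-then-wrap pair now appears at three call sites; a sketch of a shared helper, hypothetical and not part of this commit, that would keep them in sync:

def wrap_lang(text, lang_token):
    # Strip pasted newlines, then wrap in language tokens exactly once.
    return lang_token + text.replace("\n", "") + lang_token

# e.g. text = wrap_lang(text, lang_token) at each call site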
@@ -266,6 +269,7 @@ def infer_from_prompt(text, language, accent, preset_prompt, prompt_file):
     else:
         lang_token = langdropdown2token[language]
     lang = token2lang[lang_token]
+    text = text.replace("\n", "")
     text = lang_token + text + lang_token
 
     # load prompt
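infer_from_prompt receives only the newline strip. A short sketch of the string-level effect, again assuming the [EN] token format:

lang_token = "[EN]"                # assumed token format
text = "Hello\nworld"              # multi-line input from the textbox
text = lang_token + text.replace("\n", "") + lang_token
# before the fix: "[EN]Hello\nworld[EN]"; after: "[EN]Helloworld[EN]"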
 