heisenberg3376 committed on
Commit
bfd1577
1 Parent(s): 1864517

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -4
app.py CHANGED
@@ -11,6 +11,8 @@ device = "cuda:0" if torch.cuda.is_available() else "cpu"
11
 
12
  # load speech translation checkpoint
13
  asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
 
 
14
 
15
  model = VitsModel.from_pretrained('facebook/mms-tts-rus').to(device)
16
  tokenizer = VitsTokenizer.from_pretrained('facebook/mms-tts-rus')
@@ -18,8 +20,8 @@ tokenizer = VitsTokenizer.from_pretrained('facebook/mms-tts-rus')
18
 
19
 
20
  def translate(audio):
21
- outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe","language":"russian"})
22
- return outputs["text"]
23
 
24
 
25
  def synthesise(text):
@@ -33,12 +35,14 @@ def speech_to_speech_translation(audio):
33
  translated_text = translate(audio)
34
  synthesised_speech = synthesise(translated_text)
35
  synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
36
- return 16000, synthesised_speech
37
 
38
 
39
  title = "Cascaded STST"
40
  description = """
41
- Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Russian"""
 
 
42
 
43
  demo = gr.Blocks()
44
 
 
11
 
12
  # load speech translation checkpoint
13
  asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
14
+ translator = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ru")
15
+
16
 
17
  model = VitsModel.from_pretrained('facebook/mms-tts-rus').to(device)
18
  tokenizer = VitsTokenizer.from_pretrained('facebook/mms-tts-rus')
 
20
 
21
 
22
  def translate(audio):
23
+ outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
24
+ return translator(outputs['text'])[0]['translation_text']
25
 
26
 
27
  def synthesise(text):
 
35
  translated_text = translate(audio)
36
  synthesised_speech = synthesise(translated_text)
37
  synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
38
+ return 16000, synthesised_speech[0]
39
 
40
 
41
  title = "Cascaded STST"
42
  description = """
43
+ Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Russian
44
+ ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
45
+ """
46
 
47
  demo = gr.Blocks()
48