File size: 2,998 Bytes
f146c37
 
 
 
 
 
 
 
 
 
 
 
 
 
d347764
 
 
 
 
f146c37
d347764
 
 
f146c37
 
 
 
d347764
f146c37
 
d347764
f146c37
 
d347764
f146c37
d347764
f146c37
 
 
 
 
 
 
d347764
 
 
 
 
 
f146c37
 
 
d347764
f146c37
 
 
 
d347764
 
 
f146c37
d347764
 
f146c37
d347764
f146c37
f805e49
f146c37
 
 
f805e49
 
c737803
 
 
d347764
f146c37
d347764
f805e49
 
d347764
c737803
 
 
f146c37
c737803
 
 
 
 
 
f146c37
c737803
d347764
f146c37
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# -*- coding: utf-8 -*-
"""ML_task3.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1DfK6fjkAd9RjVx3MUGfDtAOulvEenk0E
"""

!pip install gradio

!pip install datasets
!pip install transformers

import gradio as gr
import numpy as np
import torch
from datasets import load_dataset

from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline, WhisperProcessor

# Use the first CUDA device when torch reports one is available, otherwise the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Speech recognition pipeline, placed on the device selected above.
asr_pipe = pipeline("automatic-speech-recognition", model="voidful/wav2vec2-xlsr-multilingual-56", device=device)

!pip -q install sentencepiece

# NOTE(review): `processor` is not referenced by any other code visible in this
# file -- confirm whether it is still needed.
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-small")

# Text translation pipelines, chained by translator_mul_ru: the output of
# translator_en is fed to translator_ru. No device argument is passed here.
translator_en = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")
translator_ru = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ru")

from transformers import VitsModel, VitsTokenizer

# Text-to-speech model and its matching tokenizer, both loaded from the same
# checkpoint; synthesise() uses them to turn text into a waveform.
model = VitsModel.from_pretrained("facebook/mms-tts-rus")
tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-rus")

def translator_mul_ru(text):
    """Translate *text* in two stages and return the final translated string.

    The text is first run through ``translator_en``; the ``translation_text``
    of its first result is then run through ``translator_ru``, and the
    ``translation_text`` of that first result is returned.
    """
    intermediate = translator_en(text)[0]['translation_text']
    final = translator_ru(intermediate)[0]['translation_text']
    return final

def translate(audio):
    """Run the ASR pipeline on *audio* and return the ``"text"`` field of its result.

    NOTE(review): the pipeline is called with generation options
    (``max_new_tokens`` and ``task="translate"``); whether they have any
    effect depends on the checkpoint behind ``asr_pipe`` -- confirm.
    """
    asr_options = {
        "max_new_tokens": 256,
        "generate_kwargs": {"task": "translate"},
    }
    return asr_pipe(audio, **asr_options)["text"]

def synthesise(text):
    """Translate *text* with ``translator_mul_ru`` and synthesise it as speech.

    Returns the model's ``"waveform"`` output moved to the CPU.
    """
    text_to_speak = translator_mul_ru(text)
    token_ids = tokenizer(text_to_speak, return_tensors="pt")["input_ids"]

    # Inference only: no gradients are needed.
    with torch.no_grad():
        waveform = model(token_ids)["waveform"]
    return waveform.cpu()

def speech_to_speech_translation(audio):
    """Convert an input recording into synthesised speech for a Gradio audio output.

    The recording is passed to ``translate`` to obtain text, the text is
    printed, and ``synthesise`` turns it into a waveform. The waveform is
    converted to 16-bit PCM and its first row is returned.

    Args:
        audio: The audio input handed over by Gradio; ``None`` when the user
            submitted without recording or uploading anything.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``samples`` is an ``int16``
        NumPy array.

    Raises:
        gr.Error: If no audio was provided.
    """
    if audio is None:
        # Give the user a readable message instead of an opaque pipeline failure.
        raise gr.Error("No audio provided. Please record or upload an audio clip.")

    recognised_text = translate(audio)
    print(recognised_text)

    waveform = synthesise(recognised_text).numpy()
    # Clip before scaling so an out-of-range sample saturates rather than
    # wrapping around when cast to int16.
    pcm = (np.clip(waveform, -1.0, 1.0) * 32767).astype(np.int16)

    # Report the rate the TTS model was configured with instead of a literal.
    return model.config.sampling_rate, pcm[0]

# Heading and description text passed to each gr.Interface below.
title = "Speech-to-Speech Translation"
description = """
* Выбранная ASR модель - https://huggingface.co/voidful/wav2vec2-xlsr-multilingual-56
* Перевод текста на русский с помощью модели https://huggingface.co/Helsinki-NLP/opus-mt-mul-en
* Синтез речи на русском языке с помощью модели https://huggingface.co/facebook/mms-tts-rus
"""

def _audio_tab(source):
    """Build one speech-to-speech interface whose audio input uses *source*."""
    return gr.Interface(
        fn=speech_to_speech_translation,
        inputs=gr.Audio(sources=source, type="filepath"),
        outputs=gr.Audio(label="Generated Speech", type="numpy"),
        title=title,
        description=description,
    )


demo = gr.Blocks()

# The two interfaces differ only in where the input audio comes from.
mic_translate = _audio_tab("microphone")
file_translate = _audio_tab("upload")

# Present both interfaces as tabs inside the single Blocks app.
with demo:
    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "File"])

demo.launch()