import gradio as gr
from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
import torch
import numpy as np
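
# Speech-translation demo: French voice or text in, English text and speech out.
# Three Hugging Face pipelines (Whisper ASR/translation, MarianMT, MMS-TTS)
# feed the Gradio interface below.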

device = "cuda:0" if torch.cuda.is_available() else "cpu"

device = "cpu"
torch_dtype = torch.float16 if device != "cpu" else torch.float32

print("Device:", device)

# Whisper large-v3: multilingual ASR whose "translate" task decodes directly to English
model_id = "openai/whisper-large-v3"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

# The processor bundles Whisper's tokenizer (text side) and feature extractor (audio side)
processor = AutoProcessor.from_pretrained(model_id)

# ASR pipeline: chunk_length_s=30 splits long recordings into 30 s windows that fit
# Whisper's context, and batch_size=16 decodes those windows in parallel
pipe_transcription = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)
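
# Two lighter pipelines complete the chain: MarianMT for French-to-English text
# translation and MMS-TTS for English speech synthesis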
pipe_translate = pipeline("translation", model="Helsinki-NLP/opus-mt-fr-en", device=device)
pipe_tts = pipeline("text-to-speech", model="facebook/mms-tts-eng", device=device)  # better quality and much faster than Bark
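# pipe_tts returns {"audio": float waveform in [-1, 1], "sampling_rate": int};
# get_audio below converts that waveform to the format gr.Audio expects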


def get_translation(text):
    # French-to-English text translation
    return pipe_translate(text)[0]["translation_text"]


def get_transcript(voice):
    # task="translate" makes Whisper emit English text directly from the French recording
    return pipe_transcription(voice, generate_kwargs={"task": "translate", "language": "french"})["text"]


def get_audio(text):
    # Scale the float waveform in [-1, 1] to 16-bit PCM for gr.Audio
    speech = pipe_tts(text)
    return speech["sampling_rate"], (speech["audio"] * 32767).astype(np.int16).T
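
# Quick sanity check of the text path (hypothetical snippet, not part of the app):
#   sr, pcm = get_audio(get_translation("Bonjour, comment allez-vous ?"))
#   print(sr, pcm.dtype, pcm.shape)  # expect int16 samples at the model's rate (16 kHz for MMS)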

# Two tabs: voice input (slower; runs ASR first) and typed text (fast)
with gr.Blocks() as demo:
    with gr.Tab("Voix (plus lent)"):
        voice = gr.Audio(sources=["microphone"], type="filepath")

        translation_button = gr.Button("Traduire votre enregistrement !")
        output_text = gr.Textbox(
            label="Texte traduit",
            info="Votre texte",
            lines=3,
            placeholder="Votre traduction",
        )

        speech_button = gr.Button("Générer audio !")

        translation_button.click(get_transcript, inputs=[voice], outputs=[output_text])
        speech_button.click(get_audio, inputs=[output_text], outputs=[gr.Audio(label="Output")])
    with gr.Tab("Texte (rapide)"):
        input_text = gr.Textbox(
            label="Input text",
            info="Your text",
            lines=3,
            placeholder="Écrire le texte à traduire",
        )
        translation_button = gr.Button("Traduire...")
        output_text = gr.Textbox(
            label="Output text",
            info="Your text",
            lines=3,
            placeholder="Votre traduction",
        )
        speech_button = gr.Button("Générer audio...")
        translation_button.click(get_translation, inputs=[input_text], outputs=[output_text])
        speech_button.click(get_audio, inputs=[output_text], outputs=[gr.Audio(label="Output")])

demo.launch()
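
# To expose a public link (e.g., when running locally or in a notebook), Gradio supports:
# demo.launch(share=True)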