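# Gradio demo: translate French to English from typed text or a microphone
# recording, with optional English text-to-speech output (Hugging Face pipelines).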
import gradio as gr
from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
import torch
import numpy as np

device = "cuda:0" if torch.cuda.is_available() else "cpu"
#device = "cpu"
torch_dtype = torch.float16 if device != "cpu" else torch.float32

print("Device:", device)

model_id = "openai/whisper-large-v3"
#model_id = "openai/whisper-medium"

# model_id = "openai/whisper-large-v3"
# model_id = "openai/whisper-medium"

# model = AutoModelForSpeechSeq2Seq.from_pretrained(
#     model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
# )
# model.to(device)

# processor = AutoProcessor.from_pretrained(model_id)

# pipe_transcription = pipeline(
#     "automatic-speech-recognition",
#     model=model,
#     tokenizer=processor.tokenizer,
#     feature_extractor=processor.feature_extractor,
#     max_new_tokens=128,
#     chunk_length_s=30,
#     batch_size=16,
#     return_timestamps=True,
#     torch_dtype=torch_dtype,
#     device=device,
# )
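# What actually runs: a French-tuned Whisper for transcription, MarianMT
# (opus-mt-fr-en) for translation, and MMS for English text-to-speech.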

pipe_transcription = pipeline("automatic-speech-recognition", model="pierreguillou/whisper-medium-french", device=device)
pipe_translate = pipeline("translation", model="Helsinki-NLP/opus-mt-fr-en", device=device)
pipe_tts = pipeline("text-to-speech", model="facebook/mms-tts-eng", device=device)  # better quality, way faster than Bark


def get_translation(text):
    return pipe_translate(text)[0]["translation_text"]

def get_transcript(voice):
    # Transcribe the French recording, then translate the transcript to English
    return get_translation(pipe_transcription(voice)["text"])

def get_audio(text):
    speech = pipe_tts(text)
    # Scale the float waveform in [-1, 1] to int16 samples for gr.Audio
    return speech["sampling_rate"], (speech["audio"] * 32767).astype(np.int16).T

with gr.Blocks() as demo:
    with gr.Tab("Texte (rapide)"):
        input_text = gr.Textbox(
                    label="Input text",
                    info="Your text",
                    lines=3,
                    placeholder="Écrire le texte à traduire",
                )
        translation_button = gr.Button("Traduire...")
        output_text = gr.Textbox(
                    label="Output text",
                    info="Your text",
                    lines=3,
                    placeholder="Votre traduction",
                )
        speech_button = gr.Button("Générer audio...")
        translation_button.click(
            get_translation,
            inputs=[input_text],
            outputs=[output_text],
        )
        speech_button.click(
            get_audio,
            inputs=[output_text],
            outputs=[gr.Audio(label="Output")],
        )
    with gr.Tab("Voix (plus lent)"):
        voice = gr.Audio(sources=["microphone"], type="filepath")

        translation_button = gr.Button("Traduire votre enregistrement !")
        output_text = gr.Textbox(
                    label="Texte traduit",
                    info="Votre texte",
                    lines=3,
                    placeholder="Votre traduction",
                )

        speech_button = gr.Button("Générer audio !")
        
        translation_button.click(
            get_transcript,
            inputs=[voice],
            outputs=[output_text],
        )
        speech_button.click(
            get_audio,
            inputs=[output_text],
            outputs=[gr.Audio(label="Output")],
        )
demo.launch()
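
# Launch notes: demo.launch(share=True) serves a temporary public link;
# the first run downloads all three model checkpoints from the Hugging Face Hub.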