File size: 3,249 Bytes
0668b67
6c8885f
0668b67
 
2914c65
0668b67
37b0e3a
0668b67
37b0e3a
261c279
0668b67
37b0e3a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0668b67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37b0e3a
0668b67
3d57e67
0668b67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a426fc1
0668b67
 
55070d4
0668b67
 
a2519fa
0668b67
e0b1de2
0668b67
d11d590
0668b67
 
 
 
 
 
 
 
517eb68
d11d590
 
 
 
0668b67
 
 
 
 
517eb68
 
0668b67
 
e0b1de2
0668b67
 
 
 
517eb68
49f5187
0668b67
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import torch
import os
from transformers import pipeline, VitsModel, VitsTokenizer
import numpy as np
os.system("pip install git+https://github.com/openai/whisper.git")
import gradio as gr
import whisper

# Whisper checkpoint used for speech recognition. It has its own name because
# the module-level name `model` is bound to the VITS text-to-speech model
# further down in this file; sharing that name made `inference` call
# Whisper-only methods on the text-to-speech model.
whisper_model = whisper.load_model("small")
device = "cuda:0" if torch.cuda.is_available() else "cpu"


def inference(audio):
    """Decode the speech in an audio file with Whisper and return the text.

    Args:
        audio: Path of the audio file; it is handed to ``whisper.load_audio``.

    Returns:
        The text decoded from the clip (also printed to stdout).
    """
    samples = whisper.load_audio(audio)
    # Whisper decodes a fixed 30-second window: shorter clips are padded and
    # longer ones are cut off.
    samples = whisper.pad_or_trim(samples)

    mel = whisper.log_mel_spectrogram(samples).to(whisper_model.device)

    # No language is set, so `whisper.decode` detects it on its own; a separate
    # `detect_language` call whose result is thrown away is not needed.
    # NOTE(review): the task is left at Whisper's default, while the UI text
    # promises English output - confirm whether task="translate" is intended.
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(whisper_model, mel, options)

    print(result.text)
    return result.text

    
# Hugging Face speech-recognition pipeline built on the Whisper-small
# checkpoint, placed on the device selected above.
pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small",
    device=device,
)

# French MMS text-to-speech checkpoint together with its matching tokenizer.
# (An earlier revision loaded the "Matthijs/mms-tts-fra" copy instead.)
model = VitsModel.from_pretrained("facebook/mms-tts-fra")
tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-fra")


# Turn an audio clip into text by running it through Whisper.
def translate(audio):
    """Return the text decoded from ``audio``.

    Thin wrapper around ``inference``; ``audio`` is passed through unchanged.
    A second implementation based on the Hugging Face ``pipe`` used to sit
    after the ``return`` statement. It could never run, so it was removed.

    Args:
        audio: Whatever ``inference`` accepts as its audio argument.

    Returns:
        The decoded text.
    """
    return inference(audio)


# Generate the waveform for a piece of text with the text-to-speech model.
def synthesise(text):
    """Synthesise speech for ``text`` and return the waveform tensor.

    Args:
        text: Text to speak; tokenised with the module-level ``tokenizer``.

    Returns:
        The first waveform of the batch produced by the module-level
        ``model``.
    """
    inputs = tokenizer(text, return_tensors="pt")

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        outputs = model(inputs["input_ids"])

    # Released versions of transformers expose the generated samples as
    # `waveform`; pre-release VITS builds named the field `audio`. Prefer the
    # released name and fall back to the old one.
    waveform = getattr(outputs, "waveform", None)
    if waveform is None:
        waveform = outputs.audio
    return waveform[0]


# Full chain: audio in -> text -> synthesised speech out.
def speech_to_speech_translation(audio):
    """Convert an input clip to text, then speak that text.

    Args:
        audio: Passed straight to ``translate``.

    Returns:
        A ``(sample_rate, samples)`` tuple: the fixed rate 16000 and the
        synthesised waveform scaled by 32767 and cast to ``int16``.
    """
    text = translate(audio)
    waveform = synthesise(text).numpy()
    # Scale the floating-point waveform up to the 16-bit integer range.
    pcm = (waveform * 32767).astype(np.int16)
    return 16000, pcm

def predict(transType, language, audio, audio_mic=None):
    """Gradio callback: turn the submitted audio into text or speech.

    Args:
        transType: Requested output, ``"Text"`` or ``"Audio"``.
        language: Source language picked in the UI; currently not used.
        audio: Path of the uploaded audio file, or ``None``.
        audio_mic: Path of the microphone recording, or ``None``.

    Returns:
        A ``(text, audio)`` pair matching the two output components: the text
        and ``None`` for ``"Text"``, or an empty string and the
        ``(sample_rate, samples)`` tuple for ``"Audio"``.

    Raises:
        gr.Error: If no audio was provided or no output format was chosen.
    """
    # The uploaded file wins; the microphone recording is the fallback.
    source = audio or audio_mic
    if not source:
        raise gr.Error("Please upload or record an audio clip first.")

    if transType == "Text":
        return translate(source), None
    if transType == "Audio":
        return "", speech_to_speech_translation(source)
    # The format radio has no default value, so nothing may be selected.
    raise gr.Error("Please choose an output format (Text or Audio).")


# Define the title etc
title = "Swedish STSOT (Speech To Speech Or Text)"
description="Use Whisper pretrained model to convert swedish audio to english (text or audio)"


supportLangs = ["Swedish", "French (in training)"]
transTypes = ["Text", "Audio"]

examples = [
    ["Text", "Swedish", "./ex1.wav", None],
    ["Audio", "Swedish", "./ex2.wav", None]
]
demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Radio(label="Choose your output format", choices=transTypes),
        gr.Radio(label="Choose a source language", choices=supportLangs, value="Swedish"),
        # The inputs are delivered as file paths: the Whisper loader behind
        # `translate` reads the audio from a path, not from a decoded array.
        gr.Audio(label="Import an audio", sources="upload", type="filepath"),
        gr.Audio(label="Record an audio", sources="microphone", type="filepath"),
    ],
    outputs=[
        gr.Text(label="Text translation"),gr.Audio(label="Audio translation",type = "numpy")
    ],
    title=title,
    description=description,
    article="",
    examples=examples,
)


demo.launch()