import os

# openai-whisper is not in the Space's requirements, so install it at runtime
os.system("pip install git+https://github.com/openai/whisper.git")

import numpy as np
import torch
import gradio as gr
import whisper
from transformers import pipeline, VitsModel, VitsTokenizer

# Base multilingual Whisper model, used only by the `inference` helper below
model = whisper.load_model("small")
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Transcribe audio with the base Whisper model (debug helper; not wired into the UI)
def inference(audio):
    audio = whisper.load_audio(audio)
    print("loading finished")
    audio = whisper.pad_or_trim(audio)
    print("audio trimmed")
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    print("spectrogram finished")
    _, probs = model.detect_language(mel)
    print("language detected")
    # fp16 must be disabled on CPU; Whisper would fall back to fp32 with a warning
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(model, mel, options)

    print(result.text)
    return result.text

# Load a Whisper-small checkpoint fine-tuned for Swedish
# (the generic openai/whisper-small baseline is kept for reference)
# pipe = pipeline("automatic-speech-recognition",
#                 model="openai/whisper-small",
#                 device=device
# )
pipe = pipeline(model="Sleepyp00/whisper-small-Swedish")
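# Note: with no device argument the transformers pipeline runs on CPU by default;
# pass device=device to place it on the GPU detected above.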

# English text-to-speech: Facebook's MMS VITS model (16 kHz output)
model2 = VitsModel.from_pretrained("facebook/mms-tts-eng")
tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")


# Translate an audio file to English text
def translate(audio):
    # return inference(audio)  # debug path: plain Whisper transcription
    outputs = pipe(audio, max_new_tokens=256,
                   generate_kwargs={"task": "translate"})
    # The ASR pipeline returns a dict like {"text": "..."}
    return outputs["text"]


# Generate a speech waveform from English text
def synthesise(text):
    inputs = tokenizer(text, return_tensors="pt")
    input_ids = inputs["input_ids"]

    with torch.no_grad():
        outputs = model2(input_ids)

    # VitsModel returns its audio in the `waveform` field (there is no `audio` attribute)
    return outputs.waveform[0]


# Full pipeline: Swedish speech -> English text -> English speech
def speech_to_speech_translation(audio):
    translated_text = translate(audio)
    synthesised_speech = synthesise(translated_text)
    # Scale the float waveform in [-1, 1] to 16-bit PCM;
    # Gradio expects numpy audio as a (sample_rate, samples) tuple
    synthesised_speech = (
        synthesised_speech.numpy() * 32767).astype(np.int16)
    return [translated_text, (16000, synthesised_speech)]
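# Example (hypothetical path): speech_to_speech_translation("sample_sv.wav")
# returns ["...translated text...", (16000, <int16 numpy array>)]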

# Route a request to text-only or speech-to-speech translation.
# Returns values for both Gradio outputs; the "Text" branch leaves the audio empty.
def predict(transType, language, audio, audio_mic=None):
    # `language` is currently unused (only Swedish is fully supported).
    # Prefer the uploaded file; fall back to the microphone recording.
    if not audio and audio_mic:
        audio = audio_mic

    if transType == "Text":
        return translate(audio), None
    if transType == "Audio":
        return speech_to_speech_translation(audio)
            
# Interface title and description
title = "Swedish STSOT (Speech To Speech Or Text)"
description = "Use a pretrained Whisper model to translate Swedish audio to English (text or audio)"


supportLangs = ["Swedish", "French (in training)"]
transTypes = ["Text", "Audio"]

# examples = [
#     ["Text", "Swedish", "./ex1.wav", None],
#     ["Audio", "Swedish", "./ex2.wav", None]
# ]

examples = []
demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Radio(label="Choose your output format", choices=transTypes),
        gr.Radio(label="Choose a source language", choices=supportLangs, value="Swedish"),
        gr.Audio(label="Import an audio", sources="upload", type="filepath"),
        #gr.Audio(label="Import an audio", sources="upload", type="numpy"),
        gr.Audio(label="Record an audio", sources="microphone", type="filepath"),
    ],
    outputs=[
        gr.Text(label="Text translation"),gr.Audio(label="Audio translation",type = "numpy")
    ],
    title=title,
    description=description,
    article="",
    examples=examples,
)


demo.launch()