import os
# openai-whisper is not pre-installed, so install it before importing it
os.system("pip install git+https://github.com/openai/whisper.git")

import torch
import numpy as np
import gradio as gr
import whisper
from transformers import pipeline, VitsModel, VitsTokenizer

# Load the Whisper ASR model; name it explicitly so the VITS model loaded
# below does not shadow it
asr_model = whisper.load_model("small")
device = "cuda:0" if torch.cuda.is_available() else "cpu"
def inference(audio):
    # `audio` is a path to an audio file on disk
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(asr_model.device)
    _, probs = asr_model.detect_language(mel)
    # task="translate" makes Whisper emit English text for non-English audio
    options = whisper.DecodingOptions(task="translate", fp16=False)
    result = whisper.decode(asr_model, mel, options)
    print(result.text)
    return result.text
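# Note: whisper.load_audio decodes through ffmpeg, so ffmpeg must be available
# on the host. Quick local check (hypothetical file name):
#   print(inference("./ex1.wav"))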
# Load Whisper-small through the transformers pipeline (an alternative ASR
# path, used only by the commented-out branch of translate() below)
pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small",
    device=device,
)
# Load the TTS model checkpoint and tokenizer (MMS VITS, French voice)
#tts_model = VitsModel.from_pretrained("Matthijs/mms-tts-fra")
#tokenizer = VitsTokenizer.from_pretrained("Matthijs/mms-tts-fra")
tts_model = VitsModel.from_pretrained("facebook/mms-tts-fra")
tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-fra")
# Translate an audio file to English text
def translate(audio):
    return inference(audio)
    # Alternative via the transformers pipeline, kept for reference (unreachable);
    # task="translate" asks Whisper for English output:
    # outputs = pipe(audio, max_new_tokens=256,
    #                generate_kwargs={"task": "translate"})
    # return outputs["text"]
# Generate the waveform output from text
def synthesise(text):
    inputs = tokenizer(text, return_tensors="pt")
    input_ids = inputs["input_ids"]
    with torch.no_grad():
        outputs = tts_model(input_ids)
    # VitsModel returns the generated audio under `waveform`
    return outputs.waveform[0]
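# Usage sketch: synthesise() returns a 1-D float tensor; its sampling rate is
# tts_model.config.sampling_rate (16 kHz for the MMS checkpoints):
#   wav = synthesise("Bonjour tout le monde")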
# Define the full pipeline: speech in, translated speech out
def speech_to_speech_translation(audio):
    translated_text = translate(audio)
    synthesised_speech = synthesise(translated_text)
    # Scale the float waveform to 16-bit PCM for Gradio
    synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
    # MMS VITS generates audio at 16 kHz
    return (16000, synthesised_speech)
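# Offline sanity check without the Gradio UI (hypothetical file name):
#   sr, pcm = speech_to_speech_translation("./ex1.wav")
#   import scipy.io.wavfile as wavfile
#   wavfile.write("out.wav", sr, pcm)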
def predict(transType, language, audio, audio_mic=None):
    print("debug1:", audio, "debug2:", audio_mic)
    # Fall back to the microphone recording when no file was uploaded
    if not audio and audio_mic:
        audio = audio_mic
    # `audio` is a file path (the inputs below use type="filepath"),
    # which is what whisper.load_audio expects
    if transType == "Text":
        return translate(audio), None
    if transType == "Audio":
        return "", speech_to_speech_translation(audio)
# Define the UI metadata
title = "Swedish STSOT (Speech To Speech Or Text)"
description = "Use a pretrained Whisper model to translate Swedish audio to English (text or audio)"
supportLangs = ["Swedish", "French (in training)"]
transTypes = ["Text", "Audio"]
examples = [
    ["Text", "Swedish", "./ex1.wav", None],
    ["Audio", "Swedish", "./ex2.wav", None],
]
demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Radio(label="Choose your output format", choices=transTypes),
        gr.Radio(label="Choose a source language", choices=supportLangs, value="Swedish"),
        # type="filepath" hands predict() a path on disk, which Whisper expects
        gr.Audio(label="Upload an audio file", sources=["upload"], type="filepath"),
        gr.Audio(label="Record audio", sources=["microphone"], type="filepath"),
    ],
    outputs=[
        gr.Text(label="Text translation"),
        gr.Audio(label="Audio translation", type="numpy"),
    ],
    title=title,
    description=description,
    article="",
    examples=examples,
)

demo.launch()
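# Querying the running app programmatically (a sketch using gradio_client; the
# positional arguments mirror the `inputs` list above):
#   from gradio_client import Client, handle_file
#   client = Client("http://127.0.0.1:7860/")
#   text, _ = client.predict("Text", "Swedish", handle_file("./ex1.wav"), None)
#   print(text)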