Spaces:
Runtime error
Runtime error
File size: 3,415 Bytes
0668b67 6c8885f c6fcce4 0668b67 2914c65 0668b67 37b0e3a 0668b67 37b0e3a 261c279 0668b67 37b0e3a a7a78fa 37b0e3a a7a78fa 37b0e3a a7a78fa 37b0e3a a7a78fa 37b0e3a a7a78fa 37b0e3a 0668b67 4c84112 0668b67 58cb1a8 0668b67 58cb1a8 c6fcce4 58cb1a8 0668b67 1eac7b5 0668b67 a7a78fa 0668b67 55070d4 0668b67 c6fcce4 0668b67 e0b1de2 0668b67 a7a78fa 0668b67 517eb68 5e0755e 0668b67 c6fcce4 36b2c0f 0668b67 e0b1de2 0668b67 517eb68 49f5187 0668b67 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 |
import torch
import os
from transformers import pipeline, VitsModel, VitsTokenizer, SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor
import numpy as np
import importlib
import subprocess
import sys

import gradio as gr

# openai-whisper is installed at start-up when it is missing. The install is
# attempted only if the import fails, so restarts do not reinstall the package,
# and it goes through the running interpreter (`sys.executable -m pip`) so the
# package lands in the environment this script actually imports from.
try:
    import whisper
except ImportError:
    subprocess.run(
        [
            sys.executable,
            "-m",
            "pip",
            "install",
            "git+https://github.com/openai/whisper.git",
        ],
        check=True,  # fail loudly here instead of at a later, less clear import error
    )
    # Make the import system notice the freshly installed package.
    importlib.invalidate_caches()
    import whisper

# Whisper model used by `inference`.
model = whisper.load_model("small")
# Torch device string: first CUDA GPU when available, otherwise the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
def inference(audio):
    """Decode an audio file with the module-level Whisper ``model``.

    Each stage prints a short progress message so the steps can be followed
    in the application log.

    Args:
        audio: Value accepted by ``whisper.load_audio`` (the UI supplies a
            file path).

    Returns:
        The ``text`` attribute of the decoding result.
    """
    waveform = whisper.load_audio(audio)
    print("loading finished")

    clip = whisper.pad_or_trim(waveform)
    print("audio trimed")

    spectrogram = whisper.log_mel_spectrogram(clip)
    spectrogram = spectrogram.to(model.device)  # same device as the model
    print("spectro finished")

    # Language detection is run, but its result is not used further down.
    _tokens, _probabilities = model.detect_language(spectrogram)
    print("lang detected")

    decoding_options = whisper.DecodingOptions(fp16=False)
    print("options decoded")

    decoded_text = whisper.decode(model, spectrogram, decoding_options).text
    print(decoded_text)
    return decoded_text
# Speech-recognition pipeline used by `translate`. The checkpoint is, going by
# its name, a Swedish fine-tune of Whisper-small; the stock
# "openai/whisper-small" checkpoint is the alternative that was tried before.
# `device` is passed explicitly so the pipeline runs on the GPU when one is
# available instead of always defaulting to the CPU.
pipe = pipeline(model="Sleepyp00/whisper-small-Swedish", device=device)

# English text-to-speech model and its tokenizer, used by `synthesise`.
model2 = VitsModel.from_pretrained("facebook/mms-tts-eng")
tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
# Translate an audio clip into text through the `pipe` pipeline.
def translate(audio):
    """Run ``pipe`` on ``audio`` with the "translate" task and return the text.

    Args:
        audio: Audio input forwarded unchanged to ``pipe``.

    Returns:
        The ``"text"`` entry of the pipeline output.
    """
    # `inference(audio)` is the alternative path through the raw whisper model.
    generation_options = {"task": "translate"}
    prediction = pipe(
        audio,
        max_new_tokens=256,
        generate_kwargs=generation_options,
    )
    return prediction["text"]
# Generate the waveform output for a piece of text.
def synthesise(text):
    """Synthesise speech for ``text`` with the module-level VITS model.

    Args:
        text: Text to be spoken; it is tokenised with ``tokenizer``.

    Returns:
        The first waveform of the model's output batch, as a torch tensor.
    """
    inputs = tokenizer(text, return_tensors="pt")
    input_ids = inputs["input_ids"]
    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model2(input_ids)
    # The VitsModel output exposes the generated samples as `waveform`;
    # it has no `audio` field, so reading `outputs.audio` fails.
    return outputs.waveform[0]
# Full pipeline: audio in, translated text plus synthesised audio out.
def speech_to_speech_translation(audio):
    """Translate ``audio`` to text, then turn that text back into speech.

    Args:
        audio: Audio input forwarded to ``translate``.

    Returns:
        A two-item list: the translated text, and a ``(16000, samples)``
        tuple where ``samples`` is an int16 NumPy array.
    """
    text = translate(audio)
    waveform = synthesise(text)
    # Scale the float samples by 32767 (the int16 maximum) before casting.
    scaled = waveform.numpy() * 32767
    pcm_samples = scaled.astype(np.int16)
    # 16000 is passed along as the sample rate; presumably it matches the
    # rate of the TTS model's output - TODO confirm.
    return [text, (16000, pcm_samples)]
def predict(transType, language, audio, audio_mic = None):
    """Gradio callback: translate the supplied audio into text or speech.

    Args:
        transType: Requested output format, ``"Text"`` or ``"Audio"``.
        language: Source language chosen in the UI; not used by this function.
        audio: Uploaded audio; takes precedence over ``audio_mic``.
        audio_mic: Microphone recording, used only when ``audio`` is empty.

    Returns:
        A pair ``(text, audio)`` for the two output components. ``audio`` is
        ``None`` for the ``"Text"`` format.

    Raises:
        gr.Error: If no audio was provided, or if ``transType`` is not one of
            the supported formats. Previously these cases surfaced as an
            opaque downstream failure or as a ``None`` return that does not
            match the two declared outputs.
    """
    print("debug1:", audio,"debug2", audio_mic)
    # Fall back to the microphone recording when nothing was uploaded.
    if not audio and audio_mic:
        audio = audio_mic
    if not audio:
        raise gr.Error("Please upload or record an audio clip first.")
    if transType == "Text":
        return translate(audio), None
    if transType == "Audio":
        return speech_to_speech_translation(audio)
    raise gr.Error("Please choose an output format (Text or Audio).")
# Static texts and choices shown in the interface.
title = "Swedish STSOT (Speech To Speech Or Text)"
description = "Use Whisper pretrained model to convert swedish audio to english (text or audio)"
supportLangs = ["Swedish", "French (in training)"]
transTypes = ["Text", "Audio"]

# No bundled examples. A row would follow the input order below:
# [output format, source language, uploaded file path, microphone recording],
# e.g. ["Text", "Swedish", "./ex1.wav", None].
examples = []

# The four inputs map, in order, onto predict(transType, language, audio, audio_mic);
# the two outputs receive the (text, audio) pair it returns.
demo = gr.Interface(
    fn=predict,
    inputs=[
        # A default is pre-selected so `predict` never receives an empty format.
        gr.Radio(
            label="Choose your output format",
            choices=transTypes,
            value=transTypes[0],
        ),
        gr.Radio(
            label="Choose a source language",
            choices=supportLangs,
            value="Swedish",
        ),
        gr.Audio(label="Import an audio", sources=["upload"], type="filepath"),
        gr.Audio(label="Record an audio", sources=["microphone"], type="filepath"),
    ],
    outputs=[
        gr.Text(label="Text translation"),
        gr.Audio(label="Audio translation", type="numpy"),
    ],
    title=title,
    description=description,
    article="",
    examples=examples,
)
demo.launch()