Spaces:

frogcho123
/

s2s

Build error

File size: 2,144 Bytes

36bec1c
4adb977
ec72da9
bbee8bf
 
bc7920f
bbee8bf
0ce7006
b3ba25a
400111e
a96b473
 
 
c3f4b33
a96b473
 
bbee8bf
 
 
 
ec72da9
 
a66dfeb
bbee8bf
ec72da9
 
bbee8bf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bc7920f
bbee8bf
 
 
fb0bdd0
bbee8bf
53c1f5b
bbee8bf
 
 
09b2c6d
45b71c9
bbee8bf
 
 
 
 
 
 
 
 
 
45b71c9

import functools
import os
import tempfile

import gradio as gr
import numpy as np
import soundfile as sf
import whisper
from gradio import components
from gtts import gTTS
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

@functools.lru_cache(maxsize=1)
def _load_whisper_model():
    """Load the Whisper "base" checkpoint once and reuse it for every request."""
    return whisper.load_model("base")


@functools.lru_cache(maxsize=1)
def _load_translation_model():
    """Load the SMALL-100 translation weights once and reuse them for every request."""
    return AutoModelForSeq2SeqLM.from_pretrained("alirezamsh/small100")


def translate_speech_to_speech(input_tuple, to_lang="ru"):
    """Translate a spoken recording into synthesized speech in another language.

    The recording is transcribed with Whisper (which also detects the spoken
    language), the transcript is translated with SMALL-100, and the
    translation is synthesized with gTTS.

    Args:
        input_tuple: ``(sample_rate, samples)`` pair as delivered by a Gradio
            ``Audio`` input of type ``"numpy"``, or ``None`` when nothing was
            recorded.
        to_lang: Language code used both as the translation target and as
            the gTTS voice. Defaults to ``"ru"``.

    Returns:
        ``(sample_rate, samples)`` with int16 samples, the layout a Gradio
        ``Audio`` output of type ``"numpy"`` consumes, or ``None`` when there
        is no input audio or the transcript is empty.
    """
    if input_tuple is None:
        return None

    # Gradio hands numpy audio over as (sample_rate, data), in that order.
    sample_rate, input_audio = input_tuple

    # A per-request scratch directory keeps concurrent requests from
    # overwriting each other's files and removes the files afterwards.
    with tempfile.TemporaryDirectory() as workdir:
        input_file = os.path.join(workdir, "input_audio.wav")
        sf.write(input_file, input_audio, sample_rate)

        # Speech recognition and spoken-language detection.
        speech_model = _load_whisper_model()
        audio = whisper.pad_or_trim(whisper.load_audio(input_file))
        mel = whisper.log_mel_spectrogram(audio).to(speech_model.device)
        _, probs = speech_model.detect_language(mel)
        lang = max(probs, key=probs.get)

        # DecodingOptions defaults to fp16; only request it when the model
        # actually sits on a CUDA device.
        options = whisper.DecodingOptions(fp16=speech_model.device.type == "cuda")
        text = whisper.decode(speech_model, mel, options).text
        if not text.strip():
            # Nothing was recognized, so there is nothing to translate or speak.
            return None

        # The tokenizer is built per request because its language attributes
        # are mutated below; only the stateless model weights are shared.
        tokenizer = AutoTokenizer.from_pretrained("alirezamsh/small100")
        tokenizer.src_lang = lang
        # NOTE(review): the SMALL-100 model card selects the output language
        # through `tokenizer.tgt_lang`; confirm the tokenizer class resolved
        # by AutoTokenizer honours it.
        tokenizer.tgt_lang = to_lang
        encoded = tokenizer(text, return_tensors="pt")
        generated_tokens = _load_translation_model().generate(**encoded)
        translated_text = tokenizer.batch_decode(
            generated_tokens, skip_special_tokens=True
        )[0]

        # gTTS emits MP3 data, so give the file a matching extension.
        output_file = os.path.join(workdir, "translated_speech.mp3")
        gTTS(text=translated_text, lang=to_lang).save(output_file)

        # Read everything into memory before the scratch directory is removed.
        translated_audio, output_rate = sf.read(output_file, dtype="int16")

    return output_rate, translated_audio

title = "Speech-to-Speech Translator"

# Build the widgets from gradio.components: the gr.inputs / gr.outputs
# namespaces are deprecated in Gradio 3, whose deprecation warning points to
# gradio.components instead. With type="numpy" the microphone recording
# reaches the callback as a (sample_rate, samples) tuple, and the output
# widget takes a tuple of the same layout.
input_audio = components.Audio(source="microphone", type="numpy")
output_audio = components.Audio(type="numpy")

# One audio input, one audio output, wired to the translation callback.
stt_demo = gr.Interface(
    fn=translate_speech_to_speech,
    inputs=input_audio,
    outputs=output_audio,
    title=title,
    description="Speak in any language, and the translator will convert it to speech in the target language.",
)

# Start the web UI only when run as a script, not when imported.
if __name__ == "__main__":
    stt_demo.launch()