Spaces:
Build error
Build error
| import gradio as gr | |
| import numpy as np | |
| import torch | |
| from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5ForSpeechToText | |
| import soundfile as sf | |
| import tempfile | |
| import os | |
# Hugging Face checkpoints used for speech recognition and speech synthesis.
_ASR_CHECKPOINT = "microsoft/speecht5_asr"
_TTS_CHECKPOINT = "microsoft/speecht5_tts"

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The shared processor is taken from the ASR checkpoint; both models are
# moved to the selected device right after loading.
processor = SpeechT5Processor.from_pretrained(_ASR_CHECKPOINT)

asr_model = SpeechT5ForSpeechToText.from_pretrained(_ASR_CHECKPOINT)
asr_model = asr_model.to(device)

tts_model = SpeechT5ForTextToSpeech.from_pretrained(_TTS_CHECKPOINT)
tts_model = tts_model.to(device)
# Sampling rate (Hz) that the waveform is converted to before it is handed to
# the SpeechT5 processor.
_ASR_SAMPLING_RATE = 16000

# Upper bound on generated token count; the tokenizer is character-level, so
# this caps the transcript length. NOTE(review): tune to the expected
# utterance length.
_ASR_MAX_LENGTH = 200


def _resample_to(waveform, source_rate, target_rate):
    """Linearly resample a 1-D waveform from ``source_rate`` to ``target_rate``."""
    if source_rate == target_rate or waveform.size == 0:
        return waveform
    target_len = max(1, int(round(waveform.size * target_rate / source_rate)))
    source_times = np.arange(waveform.size, dtype=np.float64) / source_rate
    target_times = np.arange(target_len, dtype=np.float64) / target_rate
    return np.interp(target_times, source_times, waveform).astype(np.float32)


def speech_to_text(audio):
    """Transcribe a waveform with the SpeechT5 speech-to-text model.

    Args:
        audio: Either a dict with an ``"array"`` entry (1-D waveform) and an
            optional ``"sampling_rate"`` entry, or the waveform itself. A bare
            waveform is assumed to already be sampled at 16 kHz.

    Returns:
        The decoded transcription string; ``""`` when the waveform is empty.
    """
    if isinstance(audio, dict):
        waveform = audio["array"]
        source_rate = audio.get("sampling_rate", _ASR_SAMPLING_RATE)
    else:
        waveform = audio
        source_rate = _ASR_SAMPLING_RATE

    waveform = np.asarray(waveform, dtype=np.float32).reshape(-1)
    if waveform.size == 0:
        return ""

    # The processor is told the audio is 16 kHz, so make that true first.
    waveform = _resample_to(waveform, source_rate, _ASR_SAMPLING_RATE)

    inputs = processor(
        audio=waveform,
        sampling_rate=_ASR_SAMPLING_RATE,
        return_tensors="pt",
    ).to(device)

    # SpeechT5ForSpeechToText is an encoder-decoder model: the transcript is
    # produced by autoregressive generation, not by an argmax over logits.
    with torch.no_grad():
        predicted_ids = asr_model.generate(**inputs, max_length=_ASR_MAX_LENGTH)

    return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
# Lazily filled cache so the TTS-only resources are loaded once, on first use.
_TTS_RESOURCES = {}


def _get_tts_resources():
    """Return ``(processor, vocoder, speaker_embeddings)`` for speech synthesis."""
    if not _TTS_RESOURCES:
        # Imported here because the file-level import does not include it.
        from transformers import SpeechT5HifiGan

        tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
        vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
        # Placeholder 512-dim speaker embedding (all zeros), as in the
        # Transformers documentation example. Swap in a real x-vector for a
        # natural-sounding voice.
        speaker_embeddings = torch.zeros((1, 512), device=device)
        # Fill the cache only after every load succeeded.
        _TTS_RESOURCES.update(
            processor=tts_processor,
            vocoder=vocoder,
            speaker_embeddings=speaker_embeddings,
        )
    return (
        _TTS_RESOURCES["processor"],
        _TTS_RESOURCES["vocoder"],
        _TTS_RESOURCES["speaker_embeddings"],
    )


def text_to_speech(text):
    """Synthesize speech for ``text`` with the SpeechT5 text-to-speech model.

    Args:
        text: The text to speak.

    Returns:
        A torch tensor holding the generated waveform (the HiFi-GAN vocoder
        output), located on ``device``.
    """
    tts_processor, vocoder, speaker_embeddings = _get_tts_resources()

    # ``text=`` must be a keyword: SpeechT5Processor treats the first
    # positional argument as audio.
    input_ids = tts_processor(text=text, return_tensors="pt").input_ids.to(device)

    # Without a vocoder, generate_speech returns a mel spectrogram rather than
    # a waveform; speaker embeddings are required by the model.
    with torch.no_grad():
        speech = tts_model.generate_speech(input_ids, speaker_embeddings, vocoder=vocoder)
    return speech
# Sampling rate (Hz) reported to Gradio for the synthesized response audio.
_RESPONSE_SAMPLING_RATE = 16000


def _to_mono_float32(samples):
    """Convert raw microphone samples to a mono float32 waveform.

    Integer PCM is scaled by its dtype's full range. Multi-channel input is
    averaged over axis 1 (assumes a ``(samples, channels)`` layout — TODO
    confirm against the Gradio version in use).
    """
    samples = np.asarray(samples)
    if np.issubdtype(samples.dtype, np.integer):
        full_scale = float(np.iinfo(samples.dtype).max) + 1.0
        samples = samples.astype(np.float32) / full_scale
    else:
        samples = samples.astype(np.float32)
    if samples.ndim > 1:
        # Downmix instead of flattening, which would interleave the channels.
        samples = samples.mean(axis=1)
    return samples


def demo():
    """Build the voice-chatbot Gradio interface and launch it."""
    # The Blocks object gets its own name so it does not shadow this function.
    with gr.Blocks() as app:
        gr.Markdown("# Voice Chatbot")
        gr.Markdown("Simply speak into the microphone and get an audio response.")

        audio_input = gr.Audio(sources=["microphone"], type="numpy", label="Speak")
        audio_output = gr.Audio(label="Response", autoplay=True)
        transcript_display = gr.Textbox(label="Conversation")

        def process_audio(audio):
            """Transcribe the recording and return spoken audio plus the conversation text."""
            if audio is None:
                return None, "No audio detected."

            sample_rate, audio_data = audio
            waveform = _to_mono_float32(audio_data)
            if waveform.size == 0:
                return None, "No audio detected."

            # Speech-to-text
            transcript = speech_to_text({"array": waveform, "sampling_rate": sample_rate})
            print(f"Transcribed: {transcript}")

            # Generate response (for simplicity, echo the transcript)
            response_text = transcript
            print(f"Response: {response_text}")

            # Nothing to synthesize when the transcript is blank.
            if not response_text.strip():
                return None, "No speech recognized."

            # Text-to-speech; the waveform is handed to Gradio directly rather
            # than round-tripping through a temporary WAV file.
            response_audio = text_to_speech(response_text)
            response_waveform = response_audio.detach().cpu().numpy()

            conversation = f"You: {transcript}\nAssistant: {response_text}"
            return (_RESPONSE_SAMPLING_RATE, response_waveform), conversation

        audio_input.change(
            process_audio,
            inputs=[audio_input],
            outputs=[audio_output, transcript_display],
        )

        clear_btn = gr.Button("Clear Conversation")
        clear_btn.click(lambda: (None, ""), outputs=[audio_output, transcript_display])

    app.launch()


if __name__ == "__main__":
    demo()