gpt-4-turbo-chatbot / mattgpt-text-to-audio.py
mhanagan's picture
Upload folder using huggingface_hub
a63d350 verified
#!/usr/bin/env rye run python
import io
import os
import time
from pathlib import Path

import gradio as gr
from dotenv import load_dotenv
from openai import OpenAI
# Load environment variables (e.g. OPENAI_API_KEY) from a .env file.
load_dotenv()

# API key taken from the environment; None when OPENAI_API_KEY is not set.
api_key = os.getenv("OPENAI_API_KEY")

# Shared API client. main() and stream_to_speakers() call
# `openai.audio...` on this module-level name, so it must be bound to a client
# instance (importing the OpenAI class alone does not define `openai`).
# NOTE(review): per the openai-python docs, api_key=None makes the client fall
# back to the OPENAI_API_KEY environment variable and raise if it is missing.
openai = OpenAI(api_key=api_key)

# Where main() stores the generated speech: next to this script.
speech_file_path = Path(__file__).parent / "speech.mp3"
def main() -> None:
    """Run the interactive command-line demo.

    Reads a line of text from stdin, plays it through ``stream_to_speakers``,
    requests the same text as speech again and saves it to
    ``speech_file_path``, then prints the transcription and the translation
    of that saved file.
    """
    text = input("Please enter the text you want to convert to speech: ")

    # Live playback of the entered text.
    stream_to_speakers(text)

    # Separate synthesis request whose audio is written to disk.
    tts_request = openai.audio.speech.with_streaming_response.create(
        model="tts-1",
        voice="alloy",
        input=text,
    )
    with tts_request as speech:
        speech.stream_to_file(speech_file_path)

    # Both follow-up requests use the same model and the file just written.
    whisper_args = {"model": "whisper-1", "file": speech_file_path}

    # Speech -> text.
    transcription = openai.audio.transcriptions.create(**whisper_args)
    print(transcription.text)

    # Speech -> translated text.
    translation = openai.audio.translations.create(**whisper_args)
    print(translation.text)
def stream_to_speakers(user_input: str) -> None:
    """Convert ``user_input`` to speech and play it while it streams in.

    Opens a PyAudio output stream (16-bit, mono, 24000 Hz), requests the
    speech as PCM audio, and writes each received chunk to the stream as
    soon as it arrives. Prints the time to the first byte and the total
    duration in milliseconds.

    Args:
        user_input: Text to synthesize.

    Returns:
        None. NOTE(review): this file also registers the function as a Gradio
        handler with an "audio" output; since nothing is returned, that
        output component presumably stays empty -- confirm intended behavior.
    """
    import pyaudio  # imported lazily, as in the original

    audio = pyaudio.PyAudio()
    try:
        player_stream = audio.open(format=pyaudio.paInt16, channels=1, rate=24000, output=True)
        try:
            start_time = time.time()

            with openai.audio.speech.with_streaming_response.create(
                model="tts-1",
                voice="alloy",
                response_format="pcm",
                input=user_input,
            ) as response:
                print(f"Time to first byte: {int((time.time() - start_time) * 1000)}ms")
                # Each chunk goes straight to the audio device; an intermediate
                # buffer that is filled and emptied per chunk adds nothing.
                for chunk in response.iter_bytes(chunk_size=1024):
                    player_stream.write(chunk)

            print(f"Done in {int((time.time() - start_time) * 1000)}ms.")
        finally:
            # Release the output stream even if the request or playback fails.
            player_stream.stop_stream()
            player_stream.close()
    finally:
        # Release the PyAudio instance so repeated calls do not pile up
        # unterminated instances.
        audio.terminate()
# Command-line flow: runs only when this file is executed directly, and it
# runs before the web interface below is built and launched.
if __name__ == "__main__":
    main()

# Web front end: a text box whose contents are passed to stream_to_speakers.
iface = gr.Interface(
    title="MattGPT Text to Speech",
    description="Enter text and hear it converted to speech.",
    fn=stream_to_speakers,
    inputs="text",
    outputs="audio",
)

# Start serving the interface with sharing enabled.
iface.launch(share=True)