Spaces:
Build error
Build error
File size: 5,474 Bytes
5eeb931 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
# Standard library
import os
# Third-party: OpenAI Whisper for speech-to-text
import whisper
# Third-party: Google Text-to-Speech for the spoken reply
from gtts import gTTS
from dotenv import load_dotenv
# OpenAI-compatible client (pointed at SambaNova's endpoint below)
import openai
import streamlit as st
import tempfile
# pydub handles MP3 -> WAV conversion (requires ffmpeg on the host)
from pydub import AudioSegment
import wave
# pyaudio captures microphone input (requires PortAudio on the host)
import pyaudio
# Load environment variables (expects SAMBANOVA_API_KEY in .env)
load_dotenv()
# --- Whisper model initialization ---
@st.cache_resource
def load_whisper_model():
    """Load the Whisper ASR model once per session.

    ``st.cache_resource`` memoizes the model object so Streamlit reruns
    (which re-execute the whole script) do not reload the weights.
    """
    model_size = "medium"
    return whisper.load_model(model_size)


whisper_model = load_whisper_model()
# --- Streamlit page layout ---
st.title("Conversational AI with Speech-to-Speech Response")
st.write("Upload an audio file or record your voice to start the process.")

# Sidebar control: how the user supplies their audio input.
_MODE_OPTIONS = ["Record Voice", "Upload Audio"]
interaction_mode = st.sidebar.selectbox("Choose Interaction Mode:", _MODE_OPTIONS)
# Record Voice Functionality using pyaudio + wave
def record_audio(filename, duration=5, sample_rate=44100):
    """Record mono 16-bit audio from the default microphone into a WAV file.

    Args:
        filename: Path of the WAV file to create (overwritten if present).
        duration: Recording length in seconds.
        sample_rate: Sampling rate in Hz.
    """
    chunk = 1024  # frames per buffer read
    st.info(f"Recording for {duration} seconds...")
    p = pyaudio.PyAudio()
    stream = None
    try:
        stream = p.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=sample_rate,
            input=True,
            frames_per_buffer=chunk,
        )
        frames = [
            stream.read(chunk)
            for _ in range(int(sample_rate / chunk * duration))
        ]
        # BUG FIX: query the sample width while the PyAudio instance is still
        # alive — the original called get_sample_size() after p.terminate().
        sample_width = p.get_sample_size(pyaudio.paInt16)
    finally:
        # BUG FIX: release the stream and the audio device even if
        # stream.read() raises mid-recording (original leaked both).
        if stream is not None:
            stream.stop_stream()
            stream.close()
        p.terminate()
    # Persist the raw frames as a standard WAV container.
    with wave.open(filename, 'wb') as wf:
        wf.setnchannels(1)
        wf.setsampwidth(sample_width)
        wf.setframerate(sample_rate)
        wf.writeframes(b''.join(frames))
    st.success("Recording complete!")
# Process Audio Input: either record from the microphone or accept an upload.
if interaction_mode == "Record Voice":
    duration = st.slider("Select Recording Duration (seconds):", min_value=10, max_value=120, step=10)
    record_btn = st.button("Start Recording")
    if record_btn:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
            record_audio(temp_audio.name, duration=duration)
            temp_audio_path = temp_audio.name
        st.audio(temp_audio_path, format="audio/wav")
elif interaction_mode == "Upload Audio":
    uploaded_file = st.file_uploader("Upload your audio file (MP3/WAV)", type=["mp3", "wav"])
    if uploaded_file is not None:
        # BUG FIX: keep the uploaded file's real extension. The original
        # always used ".mp3", so an uploaded WAV was later fed to
        # AudioSegment.from_mp3 (wrong decoder) and played with the wrong MIME.
        is_wav = uploaded_file.name.lower().endswith(".wav")
        suffix = ".wav" if is_wav else ".mp3"
        # Save the uploaded file temporarily so Whisper can read it from disk.
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_audio:
            temp_audio.write(uploaded_file.read())
            temp_audio_path = temp_audio.name
        st.audio(temp_audio_path, format="audio/wav" if is_wav else "audio/mp3")
# Process and Transcribe Audio, then generate and speak the reply.
if 'temp_audio_path' in locals() and temp_audio_path is not None:
    st.write("Processing the audio file...")
    # If the audio arrived as MP3, convert it to WAV for Whisper.
    if temp_audio_path.endswith(".mp3"):
        mp3_path = temp_audio_path
        audio = AudioSegment.from_mp3(mp3_path)
        temp_audio_path = mp3_path.replace(".mp3", ".wav")
        audio.export(temp_audio_path, format="wav")
        # BUG FIX: the original never deleted the intermediate MP3 temp file
        # (only the derived WAV was removed at the end), leaking one file
        # per upload.
        os.remove(mp3_path)
    # Transcribe audio using Whisper.
    result = whisper_model.transcribe(temp_audio_path)
    user_text = result["text"]
    st.write("Transcribed Text:", user_text)
    # Generate AI Response via SambaNova's OpenAI-compatible endpoint.
    st.write("Generating a conversational response...")
    client = openai.OpenAI(
        api_key=os.environ.get("SAMBANOVA_API_KEY"),
        base_url="https://api.sambanova.ai/v1",
    )
    response = client.chat.completions.create(
        model='Meta-Llama-3.1-8B-Instruct',
        messages=[
            {"role": "system", "content": (
                "You are a kind, empathetic, and intelligent assistant capable of meaningful conversations and emotional support. "
                "Your primary goals are: "
                "1. To engage in casual, friendly, and supportive conversations when the user seeks companionship or emotional relief. "
                "2. To adapt your tone and responses to match the user's mood, providing warmth and encouragement if they seem distressed or seeking emotional support. "
                "3. To answer questions accurately and provide explanations when asked, adjusting the depth and length of your answers based on the user's needs. "
                "4. To maintain a positive and non-judgmental tone, offering helpful advice or lighthearted dialogue when appropriate. "
                "5. To ensure the user feels heard, understood, and valued during every interaction. "
                "If the user does not ask a question, keep the conversation engaging and meaningful by responding thoughtfully or with light humor where appropriate."
            )},
            {"role": "user", "content": user_text},
        ],
        # Low temperature/top_p for consistent, focused replies.
        temperature=0.1,
        top_p=0.1,
    )
    answer = response.choices[0].message.content
    st.write("Response:", answer)
    # Convert response text to speech using gTTS.
    st.write("Converting the response to speech...")
    tts = gTTS(text=answer, slow=False)
    response_audio_path = "final_response.mp3"
    tts.save(response_audio_path)
    # Play and offer the response MP3 for download.
    st.audio(response_audio_path, format="audio/mp3")
    # BUG FIX: read the bytes inside a context manager — the original passed
    # an open file object to st.download_button and leaked the handle.
    with open(response_audio_path, "rb") as audio_file:
        audio_bytes = audio_file.read()
    st.download_button(
        label="Download the Response",
        data=audio_bytes,
        file_name="final_response.mp3",
        mime="audio/mpeg",
    )
    # Clean up temporary files.
    os.remove(temp_audio_path)
    os.remove(response_audio_path)
|