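"""Conversational AI with speech-to-speech responses.

Pipeline: record or upload audio -> transcribe with Whisper -> generate a reply
with a Llama model served through SambaNova's OpenAI-compatible API -> speak the
reply with gTTS. Launch with `streamlit run <this_file>.py`.
"""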
import os
import whisper
from gtts import gTTS
from dotenv import load_dotenv
import openai
import streamlit as st
import tempfile
from pydub import AudioSegment
import wave
import pyaudio

# Load environment variables
load_dotenv()
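# The .env file is expected to provide SAMBANOVA_API_KEY, used by the chat completion call below.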

# Initialize the Whisper model once; st.cache_resource keeps it in memory across Streamlit reruns
@st.cache_resource
def load_whisper_model():
    return whisper.load_model("medium")

whisper_model = load_whisper_model()

# Streamlit UI
st.title("Conversational AI with Speech-to-Speech Response")
st.write("Upload an audio file or record your voice to start the process.")

# Add a sidebar for interaction options
interaction_mode = st.sidebar.selectbox(
    "Choose Interaction Mode:", ["Record Voice", "Upload Audio"]
)
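
# Both modes below set temp_audio_path, which the shared transcription pipeline consumes.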

# Record Voice Functionality using pydub and pyaudio
def record_audio(filename, duration=5, sample_rate=44100):
    """Record mono 16-bit audio from the default microphone and save it to filename as WAV."""
    st.info(f"Recording for {duration} seconds...")
    p = pyaudio.PyAudio()

    # Open a stream for recording
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=sample_rate, input=True, frames_per_buffer=1024)

    frames = []
    for _ in range(int(sample_rate / 1024 * duration)):
        data = stream.read(1024)
        frames.append(data)

    # Capture the sample width before tearing down the PortAudio session
    sample_width = p.get_sample_size(pyaudio.paInt16)

    stream.stop_stream()
    stream.close()
    p.terminate()

    # Save the recorded frames as a WAV file
    with wave.open(filename, 'wb') as wf:
        wf.setnchannels(1)
        wf.setsampwidth(sample_width)
        wf.setframerate(sample_rate)
        wf.writeframes(b''.join(frames))

    st.success("Recording complete!")

# Process Audio Input
if interaction_mode == "Record Voice":
    duration = st.slider("Select Recording Duration (seconds):", min_value=10, max_value=120, step=10)
    record_btn = st.button("Start Recording")
    
    if record_btn:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
            record_audio(temp_audio.name, duration=duration)
            temp_audio_path = temp_audio.name
            st.audio(temp_audio_path, format="audio/wav")
elif interaction_mode == "Upload Audio":
    uploaded_file = st.file_uploader("Upload your audio file (MP3/WAV)", type=["mp3", "wav"])
    if uploaded_file is not None:
        # Keep the uploaded file's real extension so WAV uploads are not mislabeled as MP3
        suffix = os.path.splitext(uploaded_file.name)[1].lower() or ".mp3"
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_audio:
            temp_audio.write(uploaded_file.read())
            temp_audio_path = temp_audio.name
        st.audio(temp_audio_path, format="audio/wav" if suffix == ".wav" else "audio/mp3")

# Process and Transcribe Audio
if 'temp_audio_path' in locals() and temp_audio_path is not None:
    st.write("Processing the audio file...")
    
    # Whisper is fed WAV here; if the audio is MP3, convert it and drop the original temp file
    if temp_audio_path.endswith(".mp3"):
        audio = AudioSegment.from_mp3(temp_audio_path)
        wav_path = temp_audio_path.replace(".mp3", ".wav")
        audio.export(wav_path, format="wav")
        os.remove(temp_audio_path)
        temp_audio_path = wav_path

    # Transcribe audio using Whisper
    result = whisper_model.transcribe(temp_audio_path)
    user_text = result["text"]
    st.write("Transcribed Text:", user_text)

    # Generate AI Response
    st.write("Generating a conversational response...")
    client = openai.OpenAI(
        api_key=os.environ.get("SAMBANOVA_API_KEY"),
        base_url="https://api.sambanova.ai/v1",
    )
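    # Low temperature and top_p keep replies focused and close to deterministic.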
    
    response = client.chat.completions.create(
        model='Meta-Llama-3.1-8B-Instruct',
        messages=[
            {"role": "system", "content": (
                "You are a kind, empathetic, and intelligent assistant capable of meaningful conversations and emotional support. "
                "Your primary goals are: "
                "1. To engage in casual, friendly, and supportive conversations when the user seeks companionship or emotional relief. "
                "2. To adapt your tone and responses to match the user's mood, providing warmth and encouragement if they seem distressed or seeking emotional support. "
                "3. To answer questions accurately and provide explanations when asked, adjusting the depth and length of your answers based on the user's needs. "
                "4. To maintain a positive and non-judgmental tone, offering helpful advice or lighthearted dialogue when appropriate. "
                "5. To ensure the user feels heard, understood, and valued during every interaction. "
                "If the user does not ask a question, keep the conversation engaging and meaningful by responding thoughtfully or with light humor where appropriate."
            )},
            {"role": "user", "content": user_text},
        ],
        temperature=0.1,
        top_p=0.1,
    )
    
    answer = response.choices[0].message.content
    st.write("Response:", answer)

    # Convert response text to speech using gTTS
    st.write("Converting the response to speech...")
    tts = gTTS(text=answer, lang="en", slow=False)
    response_audio_path = "final_response.mp3"
    tts.save(response_audio_path)

    # Play the response and offer it for download; reading the bytes up front avoids
    # leaving an open file handle that would block the cleanup below on some platforms
    st.audio(response_audio_path, format="audio/mp3")
    with open(response_audio_path, "rb") as audio_file:
        st.download_button(
            label="Download the Response",
            data=audio_file.read(),
            file_name="final_response.mp3",
            mime="audio/mpeg",
        )

    # Clean up temporary files
    os.remove(temp_audio_path)
    os.remove(response_audio_path)