Spaces:

SmilyAI-Nova
/

Nova-Voice-chat

Sleeping

App Files Files Community

Keeby-smilyai committed on Sep 29

Commit

73ec700

verified ·

1 Parent(s): b16b776

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +201 -38

src/streamlit_app.py CHANGED Viewed

@@ -1,40 +1,203 @@
-import altair as alt
-import numpy as np
-import pandas as pd
 import streamlit as st
-"""
-# Welcome to Streamlit!
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))

 import streamlit as st
+import torch
+from transformers import (
+    pipeline,
+    AutoTokenizer,
+    AutoModelForCausalLM,
+)
+import numpy as np
+import time
+# --- 0. Streamlit App Configuration ---
+# Page-level settings: browser tab title, centered layout, sidebar collapsed on load.
+# NOTE(review): this stays ahead of every other st.* call in the script; older
+# Streamlit releases require set_page_config to be the first Streamlit command.
+st.set_page_config(
+    page_title="Nova Voice Chat (Streamlit)",
+    layout="centered",
+    initial_sidebar_state="collapsed"
+)
+# Microphone capture comes from the third-party st_audiorec component.
+# If it is not installed, show the install hint in the page and end this
+# script run instead of failing with a traceback.
+try:
+    from st_audiorec import st_audiorec
+except ImportError:
+    st.error("Please install the st-audiorec component: `pip install st-audiorec`")
+    st.stop()
+# --- 1. Global Model Loading (Cached) ---
+@st.cache_resource
+def load_models():
+    """Build the speech-to-text, chat and text-to-speech stack.
+
+    Decorated with ``st.cache_resource`` so the objects are created once and
+    reused afterwards.
+
+    Returns:
+        A 4-tuple ``(stt_pipe, tokenizer, llm_model, tts_pipe)``.
+    """
+    llm_checkpoint = "Qwen/Qwen2-0.5B-Instruct"
+    with st.spinner("Loading AI models..."):
+        print("Loading models...")
+        use_gpu = torch.cuda.is_available()
+        target_device = "cuda" if use_gpu else "cpu"
+        # Half precision on GPU, full precision on CPU.
+        weights_dtype = torch.float16 if target_device == "cuda" else torch.float32
+
+        # Speech-to-text: Whisper tiny (English only).
+        speech_recognizer = pipeline(
+            "automatic-speech-recognition",
+            model="openai/whisper-tiny.en",
+            device=target_device,
+        )
+
+        # Chat model and its tokenizer.
+        chat_tokenizer = AutoTokenizer.from_pretrained(llm_checkpoint)
+        if chat_tokenizer.pad_token is None:
+            # No dedicated pad token: reuse the end-of-sequence token.
+            chat_tokenizer.pad_token = chat_tokenizer.eos_token
+        chat_model = AutoModelForCausalLM.from_pretrained(
+            llm_checkpoint,
+            torch_dtype=weights_dtype,
+            device_map="auto",
+        )
+
+        # Text-to-speech: MMS English voice.
+        speech_synthesizer = pipeline(
+            "text-to-speech",
+            model="facebook/mms-tts-eng",
+            device=target_device,
+        )
+
+        print("Models loaded.")
+        st.success("Models loaded!")
+        return speech_recognizer, chat_tokenizer, chat_model, speech_synthesizer
+# Load the models once; load_models() is wrapped in st.cache_resource.
+# The four handles are module-level constants read by process_audio_file().
+try:
+    STT_PIPE, TOKENIZER, LLM_MODEL, TTS_PIPE = load_models()
+except Exception as e:
+    # Report any loading failure in the page, then end this script run so
+    # nothing below executes without models.
+    st.error(f"Failed to load models. Please check your hardware and dependencies. Error: {e}")
+    st.stop()
+# --- 2. State Initialization and Functions ---
+def get_initial_chat_history():
+    """Build a fresh conversation that holds only Nova's system prompt.
+
+    Returns:
+        A new one-element list of ``{"role", "content"}`` message dicts.
+    """
+    system_message = {
+        "role": "system",
+        "content": "You are Nova, an AI assistant. You are friendly and helpful. Respond naturally, as if in a conversation.",
+    }
+    return [system_message]
+# Seed the per-session values on the first run only; on later reruns whatever
+# is already stored is kept. Each default is produced lazily by a factory so
+# a new history list is only built when the key is actually missing.
+_SESSION_DEFAULT_FACTORIES = (
+    # Conversation so far, starting from the system prompt.
+    ('chat_history', get_initial_chat_history),
+    # Headline shown under the recorder.
+    ('status_text', lambda: "I'm listening..."),
+    # Pending (sampling_rate, samples) reply clip, or None when nothing is queued.
+    ('audio_to_play', lambda: None),
+)
+for _state_key, _make_default in _SESSION_DEFAULT_FACTORIES:
+    if _state_key not in st.session_state:
+        st.session_state[_state_key] = _make_default()
+def process_audio_file(wav_audio_data):
+    """Run one voice turn (STT -> LLM -> TTS) and publish the result in session state.
+
+    The function deliberately never calls ``st.rerun()``. A rerun aborts the
+    current script run at the call site, so the earlier mid-function reruns
+    stopped the turn before any model was invoked. The caller renders
+    ``status_text`` and ``audio_to_play`` after this function returns, so
+    returning normally is enough to show the outcome.
+
+    Args:
+        wav_audio_data: WAV-encoded bytes of the finished recording, or
+            ``None`` when nothing was recorded.
+
+    Returns:
+        None. Side effects on ``st.session_state``:
+        ``chat_history`` gains the user transcript and the assistant reply,
+        ``audio_to_play`` becomes ``(sampling_rate, float32 samples)`` or
+        ``None``, ``status_text`` is updated, and ``last_audio_digest``
+        remembers which recording was handled.
+    """
+    import hashlib
+    import os
+    import tempfile
+
+    if wav_audio_data is None:
+        st.session_state.status_text = "Didn't catch that. Try again."
+        st.session_state.audio_to_play = None
+        return
+
+    # The caller passes the recorder value on every script run in which it is
+    # not None. Remember a digest of the last handled recording so an
+    # unrelated rerun (e.g. a button click) does not answer it a second time.
+    # The digest is stored before processing so a failing clip is not retried
+    # on every rerun either.
+    recording_digest = hashlib.sha256(wav_audio_data).hexdigest()
+    if st.session_state.get("last_audio_digest") == recording_digest:
+        return
+    st.session_state["last_audio_digest"] = recording_digest
+
+    audio_filepath = None
+    try:
+        with st.spinner("Thinking..."):
+            # The STT pipe is fed a file path, so spill the bytes to disk.
+            # delete=False lets the file be reopened by the pipeline after
+            # this handle is closed; it is removed in the finally block.
+            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
+                tmp_file.write(wav_audio_data)
+                audio_filepath = tmp_file.name
+
+            # 1. Speech-to-Text (STT)
+            result = STT_PIPE(audio_filepath)
+            transcript = result["text"].strip() if result and result["text"] else ""
+            if not transcript:
+                st.session_state.status_text = "I couldn't hear anything clearly."
+                st.session_state.audio_to_play = None
+                return
+
+            # 2. LLM Inference
+            st.session_state.chat_history.append({"role": "user", "content": transcript})
+            # Bound the prompt: keep the system prompt plus the 9 most recent messages.
+            if len(st.session_state.chat_history) > 10:
+                st.session_state.chat_history = (
+                    [st.session_state.chat_history[0]] + st.session_state.chat_history[-9:]
+                )
+            text = TOKENIZER.apply_chat_template(
+                st.session_state.chat_history, tokenize=False, add_generation_prompt=True
+            )
+            model_inputs = TOKENIZER([text], return_tensors="pt").to(LLM_MODEL.device)
+            with torch.no_grad():
+                generated_ids = LLM_MODEL.generate(
+                    **model_inputs, max_new_tokens=256, pad_token_id=TOKENIZER.eos_token_id
+                )
+            # generate() returns prompt + completion; drop the prompt tokens.
+            prompt_length = len(model_inputs.input_ids[0])
+            response_ids = generated_ids[0][prompt_length:]
+            response = TOKENIZER.decode(response_ids, skip_special_tokens=True).strip()
+            st.session_state.chat_history.append({"role": "assistant", "content": response})
+
+        # 3. Text-to-Speech (TTS)
+        with st.spinner("Responding..."):
+            speech = TTS_PIPE(response)
+
+        # Queue the clip for the playback section of the page.
+        st.session_state.audio_to_play = (
+            speech["sampling_rate"],
+            speech["audio"].astype(np.float32),
+        )
+        st.session_state.status_text = "I'm listening..."
+    except Exception as e:
+        print(f"Error in process_audio: {e}")
+        # If the turn failed before a reply was recorded, drop the unanswered
+        # user message so the next prompt does not carry it.
+        history = st.session_state.chat_history
+        if len(history) > 1 and history[-1].get("role") == "user":
+            history.pop()
+        st.session_state.status_text = "An error occurred."
+        st.session_state.audio_to_play = None
+    finally:
+        # Always remove the temporary WAV file, on success and on failure.
+        if audio_filepath is not None:
+            try:
+                os.unlink(audio_filepath)
+            except OSError:
+                pass
+# --- 3. Streamlit Interface ---
+# NOTE: the statements below run top to bottom on every script run, and that
+# order is also the order in which the elements appear on the page.
+st.title("Nova Voice Chat 🎤")
+st.markdown("---")
+# The custom microphone recording component
+wav_audio_data = st_audiorec()
+# Hand the recorder value to the processing function whenever it is not None.
+# process_audio_file() reports back through st.session_state.status_text and
+# st.session_state.audio_to_play, which are rendered further down.
+if wav_audio_data is not None:
+    process_audio_file(wav_audio_data)
+# Status Text Display
+# status_text is only ever assigned fixed strings by this script, which is
+# what makes interpolating it into raw HTML acceptable here.
+st.markdown(
+    f'<h2 style="text-align: center; color: #1A73E8;">{st.session_state.status_text}</h2>',
+    unsafe_allow_html=True
+)
+# Small caption naming the chat model.
+st.markdown(
+    '<p style="text-align: center; color: #8C8C8C; font-size: 14px;">Qwen2-0.5B-Instruct</p>',
+    unsafe_allow_html=True
+)
+# Audio Playback
+# Display the audio player only when there's new audio to play
+if st.session_state.audio_to_play is not None:
+    # Stored as a (sampling_rate, samples) pair by process_audio_file().
+    sampling_rate, audio_array = st.session_state.audio_to_play
+    st.audio(audio_array, sample_rate=sampling_rate)
+    # Clear the stored clip so the player is rendered for this script run only.
+    st.session_state.audio_to_play = None
+# Optional: Display chat history in the sidebar
+with st.sidebar:
+    st.subheader("Chat History")
+    # Display the conversation (excluding the system prompt at index 0)
+    for message in st.session_state.chat_history[1:]:
+        with st.chat_message(message["role"]):
+            st.write(message["content"])
+    # Reset: replace the history with a fresh system-prompt-only list, then
+    # rerun so the page is redrawn from the cleared state.
+    if st.button("Reset Chat"):
+        st.session_state.chat_history = get_initial_chat_history()
+        st.session_state.status_text = "Chat reset. I'm listening..."
+        st.rerun()