Spaces:

agnixcode
/

voice_chatbot

Sleeping

App Files Community

Dua Rajper committed on Mar 3, 2025

Commit

284be95

verified ·

1 Parent(s): 5cce8b9

Update app.py

Browse files

Files changed (1) hide show

app.py +3 -20

app.py CHANGED Viewed

@@ -6,18 +6,14 @@ from espnet2.bin.tts_inference import Text2Speech
 import soundfile as sf
 from pydub import AudioSegment
 import io
-from dotenv import load_dotenv
 from streamlit_webrtc import webrtc_streamer, WebRtcMode, AudioProcessorBase
 import av
 import numpy as np
-# Load environment variables from .env file
-load_dotenv()
-# Load Groq API key from .env file
 GROQ_API_KEY = os.getenv("GROQ_API_KEY")
 if not GROQ_API_KEY:
-    st.error("Groq API key not found. Please add it to the .env file.")
     st.stop()
 # Initialize Groq client
@@ -36,10 +32,8 @@ def load_models():
         feature_extractor=processor.feature_extractor,
         return_timestamps=True  # Enable timestamps for long-form audio
     )
     # Text-to-Speech
     tts_model = Text2Speech.from_pretrained("espnet/espnet_tts_vctk_espnet_spk_voxceleb12_rawnet")
     return stt_pipe, tts_model
 stt_pipe, tts_model = load_models()
@@ -67,7 +61,6 @@ webrtc_ctx = webrtc_streamer(
 if webrtc_ctx.audio_processor:
     st.write("Recording... Press 'Stop' to finish recording.")
     # Save recorded audio to a WAV file
     if st.button("Stop and Process Recording"):
         audio_frames = webrtc_ctx.audio_processor.audio_frames
@@ -77,45 +70,35 @@ if webrtc_ctx.audio_processor:
             # Save as WAV file
             sf.write("recorded_audio.wav", audio_data, samplerate=16000)
             st.success("Recording saved as recorded_audio.wav")
             # Process the recorded audio
             speech, _ = sf.read("recorded_audio.wav")
             output = stt_pipe(speech)  # Transcribe with timestamps
             # Debug: Print the transcribed text
             st.write("Transcribed Text:", output['text'])
             # Display the text with timestamps (optional)
             if 'chunks' in output:
                 st.write("Transcribed Text with Timestamps:")
                 for chunk in output['chunks']:
                     st.write(f"{chunk['timestamp'][0]:.2f} - {chunk['timestamp'][1]:.2f}: {chunk['text']}")
             # Generate response using Groq API
             try:
                 # Debug: Print the input text
                 st.write("Input Text:", output['text'])
                 chat_completion = groq_client.chat.completions.create(
                     messages=[{"role": "user", "content": output['text']}],
                     model="mixtral-8x7b-32768",
                     temperature=0.5,
-                    max_tokens=1024
                 )
                 # Debug: Print the API response
                 st.write("API Response:", chat_completion)
                 # Extract the generated response
                 response = chat_completion.choices[0].message.content
                 st.write("Generated Response:", response)
                 # Convert response to speech
                 speech, *_ = tts_model(response, spembs=tts_model.spembs[0])  # Use the first speaker embedding
                 # Debug: Print the TTS output
                 st.write("TTS Output:", speech)
                 # Save and play the speech
                 sf.write("response.wav", speech, 22050)
                 st.audio("response.wav")

 import soundfile as sf
 from pydub import AudioSegment
 import io
 from streamlit_webrtc import webrtc_streamer, WebRtcMode, AudioProcessorBase
 import av
 import numpy as np
+# Load Groq API key from environment variables
 GROQ_API_KEY = os.getenv("GROQ_API_KEY")
 if not GROQ_API_KEY:
+    st.error("Groq API key not found. Please add it to the Hugging Face Space Secrets.")
     st.stop()
 # Initialize Groq client
         feature_extractor=processor.feature_extractor,
         return_timestamps=True  # Enable timestamps for long-form audio
     )
     # Text-to-Speech
     tts_model = Text2Speech.from_pretrained("espnet/espnet_tts_vctk_espnet_spk_voxceleb12_rawnet")
     return stt_pipe, tts_model
 stt_pipe, tts_model = load_models()
 if webrtc_ctx.audio_processor:
     st.write("Recording... Press 'Stop' to finish recording.")
     # Save recorded audio to a WAV file
     if st.button("Stop and Process Recording"):
         audio_frames = webrtc_ctx.audio_processor.audio_frames
             # Save as WAV file
             sf.write("recorded_audio.wav", audio_data, samplerate=16000)
             st.success("Recording saved as recorded_audio.wav")
             # Process the recorded audio
             speech, _ = sf.read("recorded_audio.wav")
             output = stt_pipe(speech)  # Transcribe with timestamps
             # Debug: Print the transcribed text
             st.write("Transcribed Text:", output['text'])
             # Display the text with timestamps (optional)
             if 'chunks' in output:
                 st.write("Transcribed Text with Timestamps:")
                 for chunk in output['chunks']:
                     st.write(f"{chunk['timestamp'][0]:.2f} - {chunk['timestamp'][1]:.2f}: {chunk['text']}")
             # Generate response using Groq API
             try:
                 # Debug: Print the input text
                 st.write("Input Text:", output['text'])
                 chat_completion = groq_client.chat.completions.create(
                     messages=[{"role": "user", "content": output['text']}],
                     model="mixtral-8x7b-32768",
                     temperature=0.5,
+                    max_tokens=1024,
                 )
                 # Debug: Print the API response
                 st.write("API Response:", chat_completion)
                 # Extract the generated response
                 response = chat_completion.choices[0].message.content
                 st.write("Generated Response:", response)
                 # Convert response to speech
                 speech, *_ = tts_model(response, spembs=tts_model.spembs[0])  # Use the first speaker embedding
                 # Debug: Print the TTS output
                 st.write("TTS Output:", speech)
                 # Save and play the speech
                 sf.write("response.wav", speech, 22050)
                 st.audio("response.wav")