Dua Rajper committed on
Commit
284be95
·
verified ·
1 Parent(s): 5cce8b9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +3 -20
app.py CHANGED
@@ -6,18 +6,14 @@ from espnet2.bin.tts_inference import Text2Speech
6
  import soundfile as sf
7
  from pydub import AudioSegment
8
  import io
9
- from dotenv import load_dotenv
10
  from streamlit_webrtc import webrtc_streamer, WebRtcMode, AudioProcessorBase
11
  import av
12
  import numpy as np
13
 
14
- # Load environment variables from .env file
15
- load_dotenv()
16
-
17
- # Load Groq API key from .env file
18
  GROQ_API_KEY = os.getenv("GROQ_API_KEY")
19
  if not GROQ_API_KEY:
20
- st.error("Groq API key not found. Please add it to the .env file.")
21
  st.stop()
22
 
23
  # Initialize Groq client
@@ -36,10 +32,8 @@ def load_models():
36
  feature_extractor=processor.feature_extractor,
37
  return_timestamps=True # Enable timestamps for long-form audio
38
  )
39
-
40
  # Text-to-Speech
41
  tts_model = Text2Speech.from_pretrained("espnet/espnet_tts_vctk_espnet_spk_voxceleb12_rawnet")
42
-
43
  return stt_pipe, tts_model
44
 
45
  stt_pipe, tts_model = load_models()
@@ -67,7 +61,6 @@ webrtc_ctx = webrtc_streamer(
67
 
68
  if webrtc_ctx.audio_processor:
69
  st.write("Recording... Press 'Stop' to finish recording.")
70
-
71
  # Save recorded audio to a WAV file
72
  if st.button("Stop and Process Recording"):
73
  audio_frames = webrtc_ctx.audio_processor.audio_frames
@@ -77,45 +70,35 @@ if webrtc_ctx.audio_processor:
77
  # Save as WAV file
78
  sf.write("recorded_audio.wav", audio_data, samplerate=16000)
79
  st.success("Recording saved as recorded_audio.wav")
80
-
81
  # Process the recorded audio
82
  speech, _ = sf.read("recorded_audio.wav")
83
  output = stt_pipe(speech) # Transcribe with timestamps
84
-
85
  # Debug: Print the transcribed text
86
  st.write("Transcribed Text:", output['text'])
87
-
88
  # Display the text with timestamps (optional)
89
  if 'chunks' in output:
90
  st.write("Transcribed Text with Timestamps:")
91
  for chunk in output['chunks']:
92
  st.write(f"{chunk['timestamp'][0]:.2f} - {chunk['timestamp'][1]:.2f}: {chunk['text']}")
93
-
94
  # Generate response using Groq API
95
  try:
96
  # Debug: Print the input text
97
  st.write("Input Text:", output['text'])
98
-
99
  chat_completion = groq_client.chat.completions.create(
100
  messages=[{"role": "user", "content": output['text']}],
101
  model="mixtral-8x7b-32768",
102
  temperature=0.5,
103
- max_tokens=1024
104
  )
105
-
106
  # Debug: Print the API response
107
  st.write("API Response:", chat_completion)
108
-
109
  # Extract the generated response
110
  response = chat_completion.choices[0].message.content
111
  st.write("Generated Response:", response)
112
-
113
  # Convert response to speech
114
  speech, *_ = tts_model(response, spembs=tts_model.spembs[0]) # Use the first speaker embedding
115
-
116
  # Debug: Print the TTS output
117
  st.write("TTS Output:", speech)
118
-
119
  # Save and play the speech
120
  sf.write("response.wav", speech, 22050)
121
  st.audio("response.wav")
 
6
  import soundfile as sf
7
  from pydub import AudioSegment
8
  import io
 
9
  from streamlit_webrtc import webrtc_streamer, WebRtcMode, AudioProcessorBase
10
  import av
11
  import numpy as np
12
 
13
+ # Load Groq API key from environment variables
 
 
 
14
  GROQ_API_KEY = os.getenv("GROQ_API_KEY")
15
  if not GROQ_API_KEY:
16
+ st.error("Groq API key not found. Please add it to the Hugging Face Space Secrets.")
17
  st.stop()
18
 
19
  # Initialize Groq client
 
32
  feature_extractor=processor.feature_extractor,
33
  return_timestamps=True # Enable timestamps for long-form audio
34
  )
 
35
  # Text-to-Speech
36
  tts_model = Text2Speech.from_pretrained("espnet/espnet_tts_vctk_espnet_spk_voxceleb12_rawnet")
 
37
  return stt_pipe, tts_model
38
 
39
  stt_pipe, tts_model = load_models()
 
61
 
62
  if webrtc_ctx.audio_processor:
63
  st.write("Recording... Press 'Stop' to finish recording.")
 
64
  # Save recorded audio to a WAV file
65
  if st.button("Stop and Process Recording"):
66
  audio_frames = webrtc_ctx.audio_processor.audio_frames
 
70
  # Save as WAV file
71
  sf.write("recorded_audio.wav", audio_data, samplerate=16000)
72
  st.success("Recording saved as recorded_audio.wav")
 
73
  # Process the recorded audio
74
  speech, _ = sf.read("recorded_audio.wav")
75
  output = stt_pipe(speech) # Transcribe with timestamps
 
76
  # Debug: Print the transcribed text
77
  st.write("Transcribed Text:", output['text'])
 
78
  # Display the text with timestamps (optional)
79
  if 'chunks' in output:
80
  st.write("Transcribed Text with Timestamps:")
81
  for chunk in output['chunks']:
82
  st.write(f"{chunk['timestamp'][0]:.2f} - {chunk['timestamp'][1]:.2f}: {chunk['text']}")
 
83
  # Generate response using Groq API
84
  try:
85
  # Debug: Print the input text
86
  st.write("Input Text:", output['text'])
 
87
  chat_completion = groq_client.chat.completions.create(
88
  messages=[{"role": "user", "content": output['text']}],
89
  model="mixtral-8x7b-32768",
90
  temperature=0.5,
91
+ max_tokens=1024,
92
  )
 
93
  # Debug: Print the API response
94
  st.write("API Response:", chat_completion)
 
95
  # Extract the generated response
96
  response = chat_completion.choices[0].message.content
97
  st.write("Generated Response:", response)
 
98
  # Convert response to speech
99
  speech, *_ = tts_model(response, spembs=tts_model.spembs[0]) # Use the first speaker embedding
 
100
  # Debug: Print the TTS output
101
  st.write("TTS Output:", speech)
 
102
  # Save and play the speech
103
  sf.write("response.wav", speech, 22050)
104
  st.audio("response.wav")